undo type inference add docs and asv

chris-b1 · chris-b1 · commit 4e0722d063aa · 2016-08-04T18:09:16.000-05:00
diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
@@ -114,6 +114,27 @@ def teardown(self):
         os.remove('test.csv')
 
 
+class read_csv_categorical(object):
+    goal_time = 0.2  # class-level, not a dead local inside setup()
+
+    def setup(self):
+        N = 100000
+        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
+        df = DataFrame({'a': np.random.choice(group1, N).astype('object'),
+                        'b': np.random.choice(group1, N).astype('object'),
+                        'c': np.random.choice(group1, N).astype('object')})
+        df.to_csv('strings.csv', index=False)  # fixture read by both timings
+
+    def time_read_csv_categorical_post(self):  # parse, then convert
+        read_csv('strings.csv').apply(pd.Categorical)
+
+    def time_read_csv_categorical_direct(self):  # convert while parsing
+        read_csv('strings.csv', dtype='category')
+
+    def teardown(self):
+        os.remove('strings.csv')
+
+
 class read_table_multiple_date(object):
     goal_time = 0.2
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -500,6 +500,40 @@ worth trying.
    data that was read in. It is important to note that the overall column will be
    marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes.
 
+Specifying Categorical dtype
+''''''''''''''''''''''''''''
+
+.. versionadded:: 0.19.0
+
+``Categorical`` columns can be parsed directly by specifying ``dtype='category'``.
+
+.. ipython :: python
+
+   data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+
+   pd.read_csv(StringIO(data))
+   pd.read_csv(StringIO(data)).dtypes
+   pd.read_csv(StringIO(data), dtype='category').dtypes
+
+Individual columns can be parsed as a ``Categorical`` using a dict specification.
+
+.. ipython :: python
+
+   pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
+
+.. note::
+
+   The resulting categories will always be parsed as strings (object dtype).
+   Numeric categories can be converted using the :func:`~pandas.to_numeric` function.
+
+   .. ipython :: python
+
+      df = pd.read_csv(StringIO(data), dtype='category')
+      df.dtypes
+      df['col3']
+      df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
+      df['col3']
+
 
 
 Naming and Using Columns
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -195,6 +195,14 @@ default of the index) in a DataFrame.
 :func:`read_csv` has improved support for duplicate column names
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+.. ipython:: python
+   :suppress:
+
+   from pandas.compat import StringIO
+
+.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support:
+
+
 :ref:`Duplicate column names <io.dupe_names>` are now supported in :func:`read_csv` whether
 they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`)
 
@@ -222,6 +230,44 @@ New behaviour:
 
    In [2]: pd.read_csv(StringIO(data), names=names)
 
+
+.. _whatsnew_0190.enhancements.read_csv_categorical:
+
+:func:`read_csv` supports parsing ``Categorical`` directly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :func:`read_csv` function now supports parsing a ``Categorical`` column when
+specified as a dtype (:issue:`10153`).  Depending on the structure of the data,
+this can result in a faster parse time and lower memory usage, compared to
+converting to ``Categorical`` after parsing.
+
+.. ipython :: python
+
+   data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+
+   pd.read_csv(StringIO(data))
+   pd.read_csv(StringIO(data)).dtypes
+   pd.read_csv(StringIO(data), dtype='category').dtypes
+
+Individual columns can be parsed as a ``Categorical`` using a dict specification.
+
+.. ipython :: python
+
+   pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
+
+.. note::
+
+   The resulting categories will always be parsed as strings (object dtype).
+   Numeric categories can be converted using the :func:`~pandas.to_numeric` function.
+
+   .. ipython :: python
+
+      df = pd.read_csv(StringIO(data), dtype='category')
+      df.dtypes
+      df['col3']
+      df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
+      df['col3']
+
 .. _whatsnew_0190.enhancements.semi_month_offsets:
 
 Semi-Month Offsets
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -191,9 +191,9 @@ def test_categorical_dtype(self):
 1,a,3.4
 1,a,3.4
 2,b,4.5"""
-        expected = pd.DataFrame({'a': Categorical([1, 1, 2]),
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
                                  'b': Categorical(['a', 'a', 'b']),
-                                 'c': Categorical([3.4, 3.4, 4.5])})
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
 
@@ -205,6 +205,26 @@ def test_categorical_dtype(self):
                                                       'c': CategoricalDtype()})
         tm.assert_frame_equal(actual, expected)
 
+        actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
+        expected = pd.DataFrame({'a': [1, 1, 2],
+                                 'b': Categorical(['a', 'a', 'b']),
+                                 'c': [3.4, 3.4, 4.5]})
+        tm.assert_frame_equal(actual, expected)
+
+    def test_categorical_dtype_encoding(self):
+        # GH 10153
+        cases = [('unicode_series.csv', 'latin-1'),
+                 ('utf16_ex.txt', 'utf-16')]
+
+        for f, encoding in cases:
+            pth = tm.get_data_path(f)
+            expected = self.read_csv(pth, header=None, encoding=encoding)
+            # category parse, cast back to object, must equal the plain parse
+            result = self.read_csv(pth, header=None, encoding=encoding,
+                                   dtype='category')
+            result = result.apply(lambda x: x.astype(object))
+            tm.assert_frame_equal(result, expected)
+
     def test_pass_dtype_as_recarray(self):
         if compat.is_platform_windows() and self.low_memory:
             raise nose.SkipTest(
diff --git a/pandas/parser.pyx b/pandas/parser.pyx