Skip to content

Commit 4e0722d

Browse files
committed
undo type inference add docs and asv
1 parent 849a112 commit 4e0722d

File tree

5 files changed

+220
-179
lines changed

5 files changed

+220
-179
lines changed

asv_bench/benchmarks/parser_vb.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,27 @@ def teardown(self):
114114
os.remove('test.csv')
115115

116116

117+
class read_csv_categorical(object):
    """Benchmark ``read_csv`` on string columns parsed as categoricals.

    Two strategies are timed against the same file: converting every
    column to ``Categorical`` after a default parse, and passing
    ``dtype='category'`` to the parser directly.
    """

    # Declared on the class, as the neighbouring ``read_table_multiple_date``
    # benchmark does; assigned inside ``setup`` it would only be an unused
    # local variable.
    goal_time = 0.2

    def setup(self):
        """Write ``strings.csv``: three columns of N repeated strings."""
        N = 100000
        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
        df = DataFrame({'a': np.random.choice(group1, N).astype('object'),
                        'b': np.random.choice(group1, N).astype('object'),
                        'c': np.random.choice(group1, N).astype('object')})
        df.to_csv('strings.csv', index=False)

    def time_read_csv_categorical_post(self):
        """Parse with default dtypes, then convert each column."""
        read_csv('strings.csv').apply(pd.Categorical)

    def time_read_csv_categorical_direct(self):
        """Parse every column straight to category dtype."""
        read_csv('strings.csv', dtype='category')

    def teardown(self):
        """Remove the file written by ``setup``."""
        os.remove('strings.csv')
136+
137+
117138
class read_table_multiple_date(object):
118139
goal_time = 0.2
119140

doc/source/io.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,40 @@ worth trying.
500500
data that was read in. It is important to note that the overall column will be
501501
marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes.
502502

503+
Specifying Categorical dtype
504+
''''''''''''''''''''''''''''
505+
506+
.. versionadded:: 0.19.0
507+
508+
``Categorical`` columns can be parsed directly by specifying ``dtype='category'``.
509+
510+
.. ipython :: python
511+
512+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
513+
514+
pd.read_csv(StringIO(data))
515+
pd.read_csv(StringIO(data)).dtypes
516+
pd.read_csv(StringIO(data), dtype='category').dtypes
517+
518+
Individual columns can be parsed as a ``Categorical`` using a dict specification.
519+
520+
.. ipython :: python
521+
522+
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
523+
524+
.. note::
525+
526+
The resulting categories will always be parsed as strings (object dtype).
527+
Numeric categories can be converted using the :func:`to_numeric` function.
528+
529+
.. ipython :: python
530+
531+
df = pd.read_csv(StringIO(data), dtype='category')
532+
df.dtypes
533+
df['col3']
534+
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
535+
df['col3']
536+
503537

504538

505539
Naming and Using Columns

doc/source/whatsnew/v0.19.0.txt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,14 @@ default of the index) in a DataFrame.
195195
:func:`read_csv` has improved support for duplicate column names
196196
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
197197

198+
.. ipython:: python
199+
:suppress:
200+
201+
from pandas.compat import StringIO
202+
203+
.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support:
204+
205+
198206
:ref:`Duplicate column names <io.dupe_names>` are now supported in :func:`read_csv` whether
199207
they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`)
200208

@@ -222,6 +230,44 @@ New behaviour:
222230

223231
In [2]: pd.read_csv(StringIO(data), names=names)
224232

233+
234+
.. _whatsnew_0190.enhancements.read_csv_categorical:
235+
236+
:func:`read_csv` supports parsing `Categorical` directly
237+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
238+
239+
The :func:`read_csv` function now supports parsing a ``Categorical`` column when
240+
specified as a dtype (:issue:`10153`). Depending on the structure of the data,
241+
this can result in a faster parse time and lower memory usage, compared to
242+
converting to ``Categorical`` after parsing.
243+
244+
.. ipython :: python
245+
246+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
247+
248+
pd.read_csv(StringIO(data))
249+
pd.read_csv(StringIO(data)).dtypes
250+
pd.read_csv(StringIO(data), dtype='category').dtypes
251+
252+
Individual columns can be parsed as a ``Categorical`` using a dict specification.
253+
254+
.. ipython :: python
255+
256+
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
257+
258+
.. note::
259+
260+
The resulting categories will always be parsed as strings (object dtype).
261+
Numeric categories can be converted using the :func:`to_numeric` function.
262+
263+
.. ipython :: python
264+
265+
df = pd.read_csv(StringIO(data), dtype='category')
266+
df.dtypes
267+
df['col3']
268+
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
269+
df['col3']
270+
225271
.. _whatsnew_0190.enhancements.semi_month_offsets:
226272

227273
Semi-Month Offsets

pandas/io/tests/parser/c_parser_only.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,9 @@ def test_categorical_dtype(self):
191191
1,a,3.4
192192
1,a,3.4
193193
2,b,4.5"""
194-
expected = pd.DataFrame({'a': Categorical([1, 1, 2]),
194+
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
195195
'b': Categorical(['a', 'a', 'b']),
196-
'c': Categorical([3.4, 3.4, 4.5])})
196+
'c': Categorical(['3.4', '3.4', '4.5'])})
197197
actual = self.read_csv(StringIO(data), dtype='category')
198198
tm.assert_frame_equal(actual, expected)
199199

@@ -205,6 +205,26 @@ def test_categorical_dtype(self):
205205
'c': CategoricalDtype()})
206206
tm.assert_frame_equal(actual, expected)
207207

208+
actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
209+
expected = pd.DataFrame({'a': [1, 1, 2],
210+
'b': Categorical(['a', 'a', 'b']),
211+
'c': [3.4, 3.4, 4.5]})
212+
tm.assert_frame_equal(actual, expected)
213+
214+
def test_categorical_dtype_encoding(self):
    """Check ``dtype='category'`` parsing of encoded files (GH 10153).

    Each data file is read twice with the same encoding: once with
    default dtypes and once with ``dtype='category'``. The categorical
    columns are cast back to object and compared with the default parse.
    """
    # GH 10153
    cases = [
        ('unicode_series.csv', 'latin-1'),
        ('utf16_ex.txt', 'utf-16'),
    ]

    for f, encoding in cases:
        pth = tm.get_data_path(f)
        expected = self.read_csv(pth, header=None, encoding=encoding)
        result = self.read_csv(pth, header=None, encoding=encoding,
                               dtype='category')
        result = result.apply(lambda x: x.astype(object))
        # Compare ``result``: no ``actual`` is defined in this method, so
        # the previous comparison raised NameError before checking anything.
        # NOTE(review): categories are parsed as strings, so any column the
        # default parse infers as numeric will differ here - confirm against
        # the contents of the data files.
        tm.assert_frame_equal(result, expected)
227+
208228
def test_pass_dtype_as_recarray(self):
209229
if compat.is_platform_windows() and self.low_memory:
210230
raise nose.SkipTest(

0 commit comments

Comments
 (0)