Skip to content

Commit 08315b8

Browse files
chris-b1 authored and
jorisvandenbossche committed
add docs; test for conv cast
1 parent 7fbe0a3 commit 08315b8

File tree

4 files changed

+79
-42
lines changed

4 files changed

+79
-42
lines changed

doc/source/io.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None``
157157
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
158158
(unsupported with ``engine='python'``). Use `str` or `object` to preserve and
159159
not interpret dtype.
160+
161+
.. versionadded:: 0.20.0 support for the Python parser.
162+
160163
engine : {``'c'``, ``'python'``}
161164
Parser engine to use. The C engine is faster while the python engine is
162165
currently more feature-complete.
@@ -473,10 +476,8 @@ However, if you wanted for all the data to be coerced, no matter the type, then
473476
using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be
474477
worth trying.
475478

476-
.. note::
477-
The ``dtype`` option is currently only supported by the C engine.
478-
Specifying ``dtype`` with ``engine`` other than 'c' raises a
479-
``ValueError``.
479+
.. versionadded:: 0.20.0 support for the Python parser.
480+
The ``dtype`` option is supported by the 'python' engine.
480481

481482
.. note::
482483
In some cases, reading in abnormal data with columns containing mixed dtypes

doc/source/whatsnew/v0.20.0.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,15 @@ Other enhancements
3232

3333
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
3434

35+
- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
36+
is now supported with the ``'python'`` engine. See the :ref:`io docs <io.dtypes>` for more information.
37+
38+
.. ipython:: python
39+
40+
from io import StringIO
41+
data = "a,b\n1,2\n3,4"
42+
pd.read_csv(StringIO(data), engine='python').dtypes
43+
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
3544

3645
.. _whatsnew_0200.api_breaking:
3746

pandas/io/parsers.py

Lines changed: 55 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,11 @@
116116
dtype : Type name or dict of column -> type, default None
117117
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
118118
Use `str` or `object` to preserve and not interpret dtype.
119-
If converters are specified, they will be applied AFTER
120-
dtype conversion.
119+
If converters are specified, they will be applied INSTEAD
120+
of dtype conversion.
121+
122+
.. versionadded:: 0.20.0 support for the Python parser.
123+
121124
%s
122125
converters : dict, default None
123126
Dict of functions for converting values in certain columns. Keys can either
@@ -1293,20 +1296,6 @@ def _agg_index(self, index, try_parse_dates=True):
12931296

12941297
return index
12951298

1296-
def _apply_converter(self, values, conv_f, na_values, col_na_values,
1297-
col_na_fvalues):
1298-
""" apply converter function to values, respecting NAs """
1299-
try:
1300-
values = lib.map_infer(values, conv_f)
1301-
except ValueError:
1302-
mask = lib.ismember(values, na_values).view(np.uint8)
1303-
values = lib.map_infer_mask(values, conv_f, mask)
1304-
1305-
cvals, na_count = self._infer_types(
1306-
values, set(col_na_values) | col_na_fvalues,
1307-
try_num_bool=False)
1308-
return cvals, na_count
1309-
13101299
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13111300
converters=None, dtypes=None):
13121301
result = {}
@@ -1324,45 +1313,58 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13241313
else:
13251314
col_na_values, col_na_fvalues = set(), set()
13261315

1327-
if conv_f is not None and cast_type is None:
1328-
# if type is not specified, apply the conversion first, without
1329-
# inference
1330-
cvals, na_count = self._apply_converter(
1331-
values, conv_f, na_values,
1332-
col_na_values, col_na_fvalues)
1316+
if conv_f is not None:
1317+
# conv_f applied to data before inference
1318+
# dtype isn't used if a converter is specified
1319+
try:
1320+
values = lib.map_infer(values, conv_f)
1321+
except ValueError:
1322+
mask = lib.ismember(values, na_values).view(np.uint8)
1323+
values = lib.map_infer_mask(values, conv_f, mask)
1324+
1325+
cvals, na_count = self._infer_types(
1326+
values, set(col_na_values) | col_na_fvalues,
1327+
try_num_bool=False)
13331328
else:
1334-
try_num_bool = True
1335-
if cast_type and is_object_dtype(cast_type):
1336-
# skip inference if specified dtype is object
1337-
try_num_bool = False
1329+
# skip inference if specified dtype is object
1330+
try_num_bool = not (cast_type and is_object_dtype(cast_type))
13381331

13391332
# general type inference and conversion
13401333
cvals, na_count = self._infer_types(
13411334
values, set(col_na_values) | col_na_fvalues,
13421335
try_num_bool)
13431336

1337+
# type specified in dtype param
1338+
if cast_type and not is_dtype_equal(cvals, cast_type):
1339+
cvals = self._cast_types(cvals, cast_type, c)
1340+
13441341
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
13451342
cvals = lib.downcast_int64(
13461343
cvals, _parser.na_values,
13471344
self.use_unsigned)
13481345

1349-
if cast_type and not is_dtype_equal(cvals, cast_type):
1350-
# type specificed in dtype param
1351-
1352-
cvals = self._cast_types(cvals, cast_type, c)
1353-
# for consistency with c-parser, if a converter and dtype are
1354-
# specified, apply the converter last
1355-
if conv_f is not None:
1356-
values, na_count = self._apply_converter(
1357-
values, conv_f, na_values,
1358-
col_na_values, col_na_fvalues)
1359-
13601346
result[c] = cvals
13611347
if verbose and na_count:
13621348
print('Filled %d NA values in column %s' % (na_count, str(c)))
13631349
return result
13641350

13651351
def _infer_types(self, values, na_values, try_num_bool=True):
1352+
"""
1353+
Infer types of values, possibly casting
1354+
1355+
Parameters
1356+
----------
1357+
values : ndarray
1358+
na_values : set
1359+
try_num_bool : bool, default True
1360+
try to cast values to numeric (first preference) or boolean
1361+
1362+
Returns
1363+
-------
1364+
converted : ndarray
1365+
na_count : int
1366+
"""
1367+
13661368
na_count = 0
13671369
if issubclass(values.dtype.type, (np.number, np.bool_)):
13681370
mask = lib.ismember(values, na_values)
@@ -1394,7 +1396,22 @@ def _infer_types(self, values, na_values, try_num_bool=True):
13941396
return result, na_count
13951397

13961398
def _cast_types(self, values, cast_type, column):
1397-
""" cast column to type specified in dtypes= param """
1399+
"""
1400+
Cast values to specified type
1401+
1402+
Parameters
1403+
----------
1404+
values : ndarray
1405+
cast_type : string or np.dtype
1406+
dtype to cast values to
1407+
column : string
1408+
column name - used only for error reporting
1409+
1410+
Returns
1411+
-------
1412+
converted : ndarray
1413+
"""
1414+
13981415
if is_categorical_dtype(cast_type):
13991416
# XXX this is for consistency with
14001417
# c-parser which parses all categories

pandas/io/tests/parser/dtypes.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,13 @@ def test_raise_on_passed_int_dtype_with_nas(self):
214214
self.assertRaises(ValueError, self.read_csv, StringIO(data),
215215
sep=",", skipinitialspace=True,
216216
dtype={'DOY': np.int64})
217+
218+
def test_dtype_with_converter(self):
219+
data = """a,b
220+
1.1,2.2
221+
1.2,2.3"""
222+
result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
223+
converters={'a': lambda x: str(x)})
224+
# dtype spec ignored if a converter is specified
225+
expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
226+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)