Skip to content

Commit cfa0ce4

Browse files
committed
clean up dtype checking, add function specialization
1 parent 286d907 commit cfa0ce4

File tree

1 file changed

+46
-59
lines changed

1 file changed

+46
-59
lines changed

pandas/parser.pyx

Lines changed: 46 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@ import numpy as np
3434
cimport util
3535

3636
import pandas.lib as lib
37-
from pandas.core.common import is_categorical_dtype, CategoricalDtype
37+
from pandas.core.common import (is_categorical_dtype, CategoricalDtype,
38+
is_integer_dtype, is_float_dtype,
39+
is_bool_dtype, is_object_dtype,
40+
is_string_dtype, is_datetime64_dtype)
3841
from pandas.core.categorical import Categorical
3942
from pandas.types.concat import union_categoricals
4043

@@ -224,19 +227,13 @@ cdef extern from "parser/tokenizer.h":
224227
int to_boolean(const char *item, uint8_t *val) nogil
225228

226229

227-
# XXX
228-
# this is a hack - in order to make the inference
229-
# functions generic (converting either data directly
230-
# from the parser or from a passed in hash table)
231-
# we add an "optional" parameter via fused type, that can either
232-
# be the hash table to parse, or an integer, which is used
233-
# as a sentinel to specialize the function for reading
234-
# from the parser.
235230

236-
# This is to avoid duplicating a bunch of code or
237-
# adding runtime checks, but may be too much
231+
# to make the inference functions generic
232+
# add an optional last parameter that is
233+
# the source of data to be used
234+
# other than the parser_t
238235
ctypedef kh_str_t* kh_str_t_p
239-
ctypedef int use_parser_data
236+
ctypedef void* use_parser_data
240237

241238
ctypedef fused inference_data_t:
242239
kh_str_t_p
@@ -421,11 +418,12 @@ cdef class TextReader:
421418

422419
self._set_quoting(quotechar, quoting)
423420

424-
# TODO: endianness just a placeholder?
421+
422+
dtype_order = ['int64', 'float64', 'bool', 'object']
425423
if quoting == QUOTE_NONNUMERIC:
426-
self.dtype_cast_order = ['<f8', '<i8', '|b1', '|O8']
427-
else:
428-
self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
424+
# consistent with csv module semantics for QUOTE_NONNUMERIC: skip int64 so numeric fields are cast to float
425+
dtype_order = dtype_order[1:]
426+
self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
429427

430428
if comment is not None:
431429
if len(comment) > 1:
@@ -1108,12 +1106,6 @@ cdef class TextReader:
11081106
col_dtype = self.dtype
11091107

11101108
if col_dtype is not None:
1111-
if not isinstance(col_dtype, basestring):
1112-
if isinstance(col_dtype, np.dtype) or is_categorical_dtype(col_dtype):
1113-
col_dtype = col_dtype.str
1114-
else:
1115-
col_dtype = np.dtype(col_dtype).str
1116-
11171109
col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end,
11181110
na_filter, 1, na_hashset, na_flist)
11191111

@@ -1131,7 +1123,7 @@ cdef class TextReader:
11311123
dt, i, start, end, na_filter, 0, na_hashset, na_flist)
11321124
except OverflowError:
11331125
col_res, na_count = self._convert_with_dtype(
1134-
'|O8', i, start, end, na_filter, 0, na_hashset, na_flist)
1126+
np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist)
11351127

11361128
if col_res is not None:
11371129
break
@@ -1163,70 +1155,66 @@ cdef class TextReader:
11631155
bint user_dtype,
11641156
kh_str_t *na_hashset,
11651157
object na_flist):
1166-
if dtype[1] == 'i' or dtype[1] == 'u':
1167-
result, na_count = _try_int64(self.parser, i, start, end,
1168-
na_filter, na_hashset,
1169-
<use_parser_data>NULL)
1158+
if is_integer_dtype(dtype):
1159+
result, na_count = _try_int64[use_parser_data](self.parser, i,
1160+
start, end, na_filter,
1161+
na_hashset, NULL)
11701162
if user_dtype and na_count is not None:
11711163
if na_count > 0:
11721164
raise ValueError("Integer column has NA values in "
11731165
"column {column}".format(column=i))
11741166

1175-
if result is not None and dtype[1:] != 'i8':
1167+
if result is not None and dtype != 'int64':
11761168
result = result.astype(dtype)
11771169

11781170
return result, na_count
11791171

1180-
elif dtype[1] == 'f':
1181-
result, na_count = _try_double(self.parser, i, start, end,
1182-
na_filter, na_hashset, na_flist,
1183-
<use_parser_data>NULL)
1172+
elif is_float_dtype(dtype):
1173+
result, na_count = _try_double[use_parser_data](self.parser, i, start, end,
1174+
na_filter, na_hashset, na_flist,
1175+
NULL)
11841176

1185-
if result is not None and dtype[1:] != 'f8':
1177+
if result is not None and dtype != 'float64':
11861178
result = result.astype(dtype)
11871179
return result, na_count
11881180

1189-
elif dtype[1] == 'b':
1190-
result, na_count = _try_bool_flex(self.parser, i, start, end,
1191-
na_filter, na_hashset,
1192-
self.true_set, self.false_set,
1193-
<use_parser_data>NULL)
1181+
elif is_bool_dtype(dtype):
1182+
result, na_count = _try_bool_flex[use_parser_data](self.parser, i, start, end,
1183+
na_filter, na_hashset,
1184+
self.true_set, self.false_set,
1185+
NULL)
11941186
return result, na_count
1195-
elif dtype[1] == 'c':
1196-
raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
1197-
1198-
elif dtype[1] == 'S':
1187+
elif dtype.kind == 'S':
11991188
# TODO: na handling
1200-
width = int(dtype[2:])
1189+
width = dtype.itemsize
12011190
if width > 0:
12021191
result = _to_fw_string(self.parser, i, start, end, width)
12031192
return result, 0
12041193

12051194
# treat as a regular string parsing
12061195
return self._string_convert(i, start, end, na_filter,
12071196
na_hashset)
1208-
elif dtype[1] == 'U':
1209-
width = int(dtype[2:])
1197+
elif dtype.kind == 'U':
1198+
width = dtype.itemsize
12101199
if width > 0:
12111200
raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
12121201

12131202
# unicode variable width
12141203
return self._string_convert(i, start, end, na_filter,
12151204
na_hashset)
12161205
# is this comparison good enough?
1217-
elif dtype == '|O08':
1206+
elif is_categorical_dtype(dtype):
12181207
codes, cats, na_count = _categorical_convert(self.parser, i, start,
12191208
end, na_filter, na_hashset,
12201209
na_flist, self.true_set,
12211210
self.false_set, self.c_encoding)
1222-
12231211
return Categorical(codes, categories=cats, ordered=False,
12241212
fastpath=True), na_count
1225-
elif dtype[1] == 'O':
1213+
elif is_object_dtype(dtype):
12261214
return self._string_convert(i, start, end, na_filter,
12271215
na_hashset)
12281216
else:
1229-
if dtype[1] == 'M':
1217+
if is_datetime64_dtype(dtype):
12301218
raise TypeError("the dtype %s is not supported for parsing, "
12311219
"pass this column using parse_dates instead" % dtype)
12321220
raise TypeError("the dtype %s is not supported for parsing" % dtype)
@@ -1588,7 +1576,7 @@ cdef _categorical_convert(parser_t *parser, int col,
15881576

15891577
codes[i] = table.vals[k]
15901578

1591-
1579+
# Codes are complete; now run inference on the categories
15921580
# follow the same inference attempts as
15931581
# normal data (int64, float64, bool, object)
15941582
result, result_na = _try_int64(parser, col, 0, table.n_occupied,
@@ -1603,9 +1591,10 @@ cdef _categorical_convert(parser_t *parser, int col,
16031591
result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied,
16041592
na_filter, na_hashset, true_hashset,
16051593
false_hashset, table)
1606-
# duplicated logic here, but doesn't make sense to reuse
1607-
# other string logic since those paths factorize where we
1608-
# already have guaranteed uniques
1594+
1595+
# if no numeric types parsed, convert to object.
1596+
# Note that the decoding path logic should sync up with that
1597+
# of `TextReader._string_convert`
16091598
if result is None:
16101599
i = 0
16111600
result = np.empty(table.n_occupied, dtype=np.object_)
@@ -1694,10 +1683,10 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end,
16941683

16951684

16961685
cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end,
1697-
bint na_filter, kh_str_t *na_hashset, bint use_na_flist,
1698-
const kh_float64_t *na_flist,
1699-
double NA, double *data, int *na_count,
1700-
inference_data_t inference_data) nogil:
1686+
bint na_filter, kh_str_t *na_hashset, bint use_na_flist,
1687+
const kh_float64_t *na_flist,
1688+
double NA, double *data, int *na_count,
1689+
inference_data_t inference_data) nogil:
17011690
cdef:
17021691
int error,
17031692
size_t i
@@ -1783,7 +1772,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
17831772
lines = line_end - line_start
17841773
result = np.empty(lines, dtype=np.int64)
17851774
data = <int64_t *> result.data
1786-
# compile time
17871775
with nogil:
17881776
error = _try_int64_nogil(parser, col, line_start, line_end, na_filter,
17891777
na_hashset, NA, data, &na_count, inference_data)
@@ -2104,7 +2092,6 @@ def _concatenate_chunks(list chunks):
21042092

21052093
if is_categorical_dtype(dtypes.pop()):
21062094
result[name] = union_categoricals(arrs)
2107-
#np.concatenate([c.codes for c in arrs])
21082095
else:
21092096
result[name] = np.concatenate(arrs)
21102097

0 commit comments

Comments
 (0)