@@ -34,7 +34,10 @@ import numpy as np
cimport util

import pandas.lib as lib
- from pandas.core.common import is_categorical_dtype, CategoricalDtype
+ from pandas.core.common import (is_categorical_dtype, CategoricalDtype,
+                                 is_integer_dtype, is_float_dtype,
+                                 is_bool_dtype, is_object_dtype,
+                                 is_string_dtype, is_datetime64_dtype)
from pandas.core.categorical import Categorical
from pandas.types.concat import union_categoricals
@@ -224,19 +227,13 @@ cdef extern from "parser/tokenizer.h":
    int to_boolean(const char *item, uint8_t *val) nogil


- # XXX
- # this is a hack - in order to make the inference
- # functions generic (converting either data directly
- # from the parser or from a passed in hash table)
- # we add an "optional" parameter via fused type, that can either
- # be the hash table to parse, or an integer, which is used
- # as a sentinel to specialize the function for reading
- # from the parser.

- # This is to avoid duplicating a bunch of code or
- # adding runtime checks, but may be too much
+ # to make the inference functions generic,
+ # add an optional last parameter that is
+ # the source of data to be used
+ # other than the parser_t
ctypedef kh_str_t* kh_str_t_p
- ctypedef int use_parser_data
+ ctypedef void* use_parser_data

ctypedef fused inference_data_t:
    kh_str_t_p
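
The sentinel typedef above is what drives the fused-type dispatch: indexing a
function such as _try_int64[use_parser_data](...) picks, at compile time, the
specialization that ignores the extra argument and reads from the parser,
while passing a kh_str_t* picks the hash-table specialization. A minimal,
self-contained Cython sketch of the mechanism (every name below is
illustrative only, not the pandas code):

# cython: language_level=3
# Illustration of the fused-type trick: one "real" member type plus a
# void* sentinel typedef lets a single function body be specialized at
# compile time for two data sources, with no runtime dispatch.

ctypedef double* real_source       # stands in for kh_str_t_p
ctypedef void* default_source      # stands in for use_parser_data (sentinel)

ctypedef fused source_t:
    real_source
    default_source

cdef double read_value(source_t src, Py_ssize_t i):
    if source_t is default_source:
        # compiled only in the sentinel specialization; the parser code
        # would read token i straight from parser_t here
        return <double>i
    else:
        # compiled only in the real-source specialization
        return src[i]

def demo():
    cdef double buf[3]
    buf[0] = 1.0
    buf[1] = 2.0
    buf[2] = 3.0
    # explicit specialization by indexing, as in _try_int64[use_parser_data](...)
    print(read_value[default_source](NULL, 2))   # -> 2.0
    print(read_value[real_source](&buf[0], 1))   # -> 2.0
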
@@ -421,11 +418,12 @@ cdef class TextReader:

        self._set_quoting(quotechar, quoting)

-         # TODO: endianness just a placeholder?
+
+         dtype_order = ['int64', 'float64', 'bool', 'object']
        if quoting == QUOTE_NONNUMERIC:
-             self.dtype_cast_order = ['<f8', '<i8', '|b1', '|O8']
-         else:
-             self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
+             # consistent with csv module semantics, cast all to float
+             dtype_order = dtype_order[1:]
+         self.dtype_cast_order = [np.dtype(x) for x in dtype_order]

        if comment is not None:
            if len(comment) > 1:
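
The new comment points at stdlib csv semantics: with QUOTE_NONNUMERIC the
reader converts every unquoted field to float, so dropping 'int64' from the
inference order keeps the C parser consistent with that behaviour. A quick
check against the standard library (plain Python, for illustration only):

import csv
import io

# csv.QUOTE_NONNUMERIC returns every unquoted field as a float, which is why
# the cast order above skips the int64 attempt in that mode.
rows = list(csv.reader(io.StringIO('"a",1,2.5\n'),
                       quoting=csv.QUOTE_NONNUMERIC))
print(rows)   # [['a', 1.0, 2.5]] -- the quoted 'a' stays a string
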
@@ -1108,12 +1106,6 @@ cdef class TextReader:
                    col_dtype = self.dtype

        if col_dtype is not None:
-             if not isinstance(col_dtype, basestring):
-                 if isinstance(col_dtype, np.dtype) or is_categorical_dtype(col_dtype):
-                     col_dtype = col_dtype.str
-                 else:
-                     col_dtype = np.dtype(col_dtype).str
-
            col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end,
                                                         na_filter, 1, na_hashset, na_flist)
@@ -1131,7 +1123,7 @@ cdef class TextReader:
                    dt, i, start, end, na_filter, 0, na_hashset, na_flist)
            except OverflowError:
                col_res, na_count = self._convert_with_dtype(
-                     '|O8', i, start, end, na_filter, 0, na_hashset, na_flist)
+                     np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist)

            if col_res is not None:
                break
@@ -1163,70 +1155,66 @@ cdef class TextReader:
                             bint user_dtype,
                             kh_str_t *na_hashset,
                             object na_flist):
-         if dtype[1] == 'i' or dtype[1] == 'u':
-             result, na_count = _try_int64(self.parser, i, start, end,
-                                           na_filter, na_hashset,
-                                           <use_parser_data>NULL)
+         if is_integer_dtype(dtype):
+             result, na_count = _try_int64[use_parser_data](self.parser, i,
+                                                            start, end, na_filter,
+                                                            na_hashset, NULL)
            if user_dtype and na_count is not None:
                if na_count > 0:
                    raise ValueError("Integer column has NA values in "
                                     "column {column}".format(column=i))

-             if result is not None and dtype[1:] != 'i8':
+             if result is not None and dtype != 'int64':
                result = result.astype(dtype)

            return result, na_count

-         elif dtype[1] == 'f':
-             result, na_count = _try_double(self.parser, i, start, end,
-                                            na_filter, na_hashset, na_flist,
-                                            <use_parser_data>NULL)
+         elif is_float_dtype(dtype):
+             result, na_count = _try_double[use_parser_data](self.parser, i, start, end,
+                                                             na_filter, na_hashset, na_flist,
+                                                             NULL)

-             if result is not None and dtype[1:] != 'f8':
+             if result is not None and dtype != 'float64':
                result = result.astype(dtype)
            return result, na_count

-         elif dtype[1] == 'b':
-             result, na_count = _try_bool_flex(self.parser, i, start, end,
-                                               na_filter, na_hashset,
-                                               self.true_set, self.false_set,
-                                               <use_parser_data>NULL)
+         elif is_bool_dtype(dtype):
+             result, na_count = _try_bool_flex[use_parser_data](self.parser, i, start, end,
+                                                                na_filter, na_hashset,
+                                                                self.true_set, self.false_set,
+                                                                NULL)
            return result, na_count
-         elif dtype[1] == 'c':
-             raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
-
-         elif dtype[1] == 'S':
+         elif dtype.kind == 'S':
            # TODO: na handling
-             width = int(dtype[2:])
+             width = dtype.itemsize
            if width > 0:
                result = _to_fw_string(self.parser, i, start, end, width)
                return result, 0

            # treat as a regular string parsing
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
-         elif dtype[1] == 'U':
-             width = int(dtype[2:])
+         elif dtype.kind == 'U':
+             width = dtype.itemsize
            if width > 0:
                raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)

            # unicode variable width
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        # is this comparison good enough?
-         elif dtype == '|O08':
+         elif is_categorical_dtype(dtype):
            codes, cats, na_count = _categorical_convert(self.parser, i, start,
                                                         end, na_filter, na_hashset,
                                                         na_flist, self.true_set,
                                                         self.false_set, self.c_encoding)
-
            return Categorical(codes, categories=cats, ordered=False,
                               fastpath=True), na_count
-         elif dtype[1] == 'O':
+         elif is_object_dtype(dtype):
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        else:
-             if dtype[1] == 'M':
+             if is_datetime64_dtype(dtype):
                raise TypeError("the dtype %s is not supported for parsing, "
                                "pass this column using parse_dates instead" % dtype)
            raise TypeError("the dtype %s is not supported for parsing" % dtype)
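
The branch tests above now operate on real np.dtype objects through pandas'
dtype-introspection helpers instead of slicing the array-protocol string
(dtype[1], dtype[1:]). The equivalence, sketched in plain Python; note that
this patch imports the helpers from pandas.core.common, while later pandas
versions expose them from pandas.api.types:

import numpy as np
from pandas.api.types import (is_integer_dtype, is_float_dtype,
                              is_bool_dtype, is_object_dtype)

dt = np.dtype('int64')
# old style: inspect the array-protocol string, e.g. '<i8'
print(dt.str, dt.str[1] in 'iu')            # '<i8' True
# new style: ask the dtype object directly
print(is_integer_dtype(dt))                 # True
print(is_float_dtype(np.dtype('float64')),  # True
      is_bool_dtype(np.dtype('bool')),      # True
      is_object_dtype(np.dtype('object')))  # True
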
@@ -1588,7 +1576,7 @@ cdef _categorical_convert(parser_t *parser, int col,

        codes[i] = table.vals[k]

-
+     # Codes are complete, now inference on cats
    # follow the same inference attempts as
    # normal data (int64, float64, bool, object)
    result, result_na = _try_int64(parser, col, 0, table.n_occupied,
@@ -1603,9 +1591,10 @@ cdef _categorical_convert(parser_t *parser, int col,
        result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied,
                                           na_filter, na_hashset, true_hashset,
                                           false_hashset, table)
-     # duplicated logic here, but doesn't make sense to reuse
-     # other string logic since those paths factorize where we
-     # already have guaranteed uniques
+
+     # if no numeric types parsed, convert to object.
+     # Note that the decoding path logic should sync up with that
+     # of `TextReader._string_convert`
    if result is None:
        i = 0
        result = np.empty(table.n_occupied, dtype=np.object_)
@@ -1694,10 +1683,10 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end,


cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end,
-                                    bint na_filter, kh_str_t *na_hashset, bint use_na_flist,
-                                    const kh_float64_t *na_flist,
-                                    double NA, double *data, int *na_count,
-                                    inference_data_t inference_data) nogil:
+                                   bint na_filter, kh_str_t *na_hashset, bint use_na_flist,
+                                   const kh_float64_t *na_flist,
+                                   double NA, double *data, int *na_count,
+                                   inference_data_t inference_data) nogil:
    cdef:
        int error,
        size_t i
@@ -1783,7 +1772,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
    lines = line_end - line_start
    result = np.empty(lines, dtype=np.int64)
    data = <int64_t *>result.data
-     # compile time
    with nogil:
        error = _try_int64_nogil(parser, col, line_start, line_end, na_filter,
                                 na_hashset, NA, data, &na_count, inference_data)
@@ -2104,7 +2092,6 @@ def _concatenate_chunks(list chunks):

        if is_categorical_dtype(dtypes.pop()):
            result[name] = union_categoricals(arrs)
-             # np.concatenate([c.codes for c in arrs])
        else:
            result[name] = np.concatenate(arrs)
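
union_categoricals is what makes the per-chunk categorical results safe to
combine: each chunk may have seen a different set of categories, so the raw
codes are not directly comparable and the commented-out
np.concatenate([c.codes for c in arrs]) removed above would be wrong. A small
illustration (the import path shown is the later pandas.api.types location;
this patch imports it from pandas.types.concat):

import pandas as pd
from pandas.api.types import union_categoricals

# two chunks of the same column, factorized independently
chunk1 = pd.Categorical(['a', 'b'])   # codes [0, 1], categories ['a', 'b']
chunk2 = pd.Categorical(['b', 'c'])   # codes [0, 1], categories ['b', 'c']

merged = union_categoricals([chunk1, chunk2])
print(list(merged))              # ['a', 'b', 'b', 'c']
print(list(merged.categories))   # ['a', 'b', 'c']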