diff --git a/doc/source/release.rst b/doc/source/release.rst index 9324d8b28f107..d4b2ee612965f 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -119,6 +119,7 @@ Bug Fixes - Regresssion in handling of empty Series as indexers to Series (:issue:`5877`) - Bug in internal caching, related to (:issue:`5727`) - Testing bug in reading json/msgpack from a non-filepath on windows under py3 (:issue:`5874`) + - Fix performance regression in JSON IO (:issue:`5765`) - Bug when assigning to .ix[tuple(...)] (:issue:`5896`) - Bug in fully reindexing a Panel (:issue:`5905`) - Bug in idxmin/max with object dtypes (:issue:`5914`) diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c index 85a8387547641..bae075b4376b1 100644 --- a/pandas/src/ujson/lib/ultrajsondec.c +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -894,15 +894,23 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf ds.dec = dec; - locale = strdup(setlocale(LC_NUMERIC, NULL)); - if (!locale) + locale = setlocale(LC_NUMERIC, NULL); + if (strcmp(locale, "C")) { - return SetError(&ds, -1, "Could not reserve memory block"); + locale = strdup(locale); + if (!locale) + { + return SetError(&ds, -1, "Could not reserve memory block"); + } + setlocale(LC_NUMERIC, "C"); + ret = decode_any (&ds); + setlocale(LC_NUMERIC, locale); + free(locale); + } + else + { + ret = decode_any (&ds); } - setlocale(LC_NUMERIC, "C"); - ret = decode_any (&ds); - setlocale(LC_NUMERIC, locale); - free(locale); if (ds.escHeap) { diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c index 17048bd86adc2..5e2a226ae8d63 100644 --- a/pandas/src/ujson/lib/ultrajsonenc.c +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -917,16 +917,24 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t enc->end = enc->start + _cbBuffer; enc->offset = enc->start; - locale = strdup(setlocale(LC_NUMERIC, NULL)); - if (!locale) + locale = setlocale(LC_NUMERIC, NULL); + if (strcmp(locale, "C")) { - SetError(NULL, enc, "Could not reserve memory block"); - return NULL; + locale = strdup(locale); + if (!locale) + { + SetError(NULL, enc, "Could not reserve memory block"); + return NULL; + } + setlocale(LC_NUMERIC, "C"); + encode (obj, enc, NULL, 0); + setlocale(LC_NUMERIC, locale); + free(locale); + } + else + { + encode (obj, enc, NULL, 0); } - setlocale(LC_NUMERIC, "C"); - encode (obj, enc, NULL, 0); - setlocale(LC_NUMERIC, locale); - free(locale); Buffer_Reserve(enc, 1); if (enc->errorMsg) diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 13d403cdb2b7b..f6cb5b9803e25 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -74,6 +74,8 @@ typedef struct __NpyArrContext npy_intp stride; npy_intp ndim; npy_intp index[NPY_MAXDIMS]; + int type_num; + PyArray_GetItemFunc* getitem; char** rowLabels; char** columnLabels; @@ -114,6 +116,10 @@ typedef struct __PyObjectEncoder // pass through the NpyArrContext when encoding multi-dimensional arrays NpyArrContext* npyCtxtPassthru; + // pass through a request for a specific encoding context + int requestType; + TypeContext* requestTypeContext; + int datetimeIso; PANDAS_DATETIMEUNIT datetimeUnit; @@ -182,6 +188,35 @@ void initObjToJSON(void) return NUMPY_IMPORT_ARRAY_RETVAL; } +TypeContext* createTypeContext() +{ + TypeContext *pc; + + pc = PyObject_Malloc(sizeof(TypeContext)); + if (!pc) + { + PyErr_NoMemory(); + return NULL; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->cStr = NULL; + pc->npyarr = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; + + return pc; +} + static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { PyObject *obj = (PyObject *) _obj; @@ -288,6 +323,7 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, s pandas_datetimestruct dts; PyObject *obj = (PyObject *) _obj; + PRINTMARK(); if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) { @@ -310,6 +346,8 @@ static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue pandas_datetimestruct dts; PyObject *obj = (PyObject *) _obj; + PRINTMARK(); + pandas_datetime_to_datetimestruct(PyLong_AsLongLong(obj), PANDAS_FR_ns, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } @@ -338,6 +376,26 @@ static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_ return outValue; } +void requestDateEncoding(PyObject* obj, PyObjectEncoder* pyenc) +{ + if (obj == Py_None) { + pyenc->requestType = JT_NULL; + return; + } + + if (pyenc->datetimeIso) + { + pyenc->requestType = JT_UTF8; + } + else + { + pyenc->requestType = JT_LONG; + } + pyenc->requestTypeContext = createTypeContext(); + pyenc->requestTypeContext->PyTypeToJSON = NpyDatetime64ToJSON; +} + + //============================================================================= // Numpy array iteration functions //============================================================================= @@ -374,9 +432,11 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) } npyarr->array = (PyObject*) obj; + npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; + npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { @@ -450,7 +510,9 @@ void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) { NpyArrContext* npyarr; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; if (PyErr_Occurred()) @@ -469,7 +531,22 @@ int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) return 0; } - GET_TC(tc)->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array); +#if NPY_API_VERSION < 0x00000007 + if(PyTypeNum_ISDATETIME(npyarr->type_num)) + { + GET_TC(tc)->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array); + } + else + { + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + } +#else + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + if(PyTypeNum_ISDATETIME(npyarr->type_num)) + { + requestDateEncoding(GET_TC(tc)->itemValue, (PyObjectEncoder*) tc->encoder); + } +#endif npyarr->dataptr += npyarr->stride; npyarr->index[npyarr->stridedim]++; @@ -1104,6 +1181,8 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in char** ret; char *dataptr, *cLabel, *origend, *origst, *origoffset; char labelBuffer[NPY_JSON_BUFSIZE]; + PyArray_GetItemFunc* getitem; + int type_num; PRINTMARK(); if (PyArray_SIZE(labels) < num) @@ -1132,10 +1211,27 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); + getitem = (PyArray_GetItemFunc*) PyArray_DESCR(labels)->f->getitem; + type_num = PyArray_DESCR(labels)->type_num; for (i = 0; i < num; i++) { - item = PyArray_ToScalar(dataptr, labels); +#if NPY_API_VERSION < 0x00000007 + if(PyTypeNum_ISDATETIME(type_num)) + { + item = PyArray_ToScalar(dataptr, labels); + } + else + { + item = getitem(dataptr, labels); + } +#else + item = getitem(dataptr, labels); + if(PyTypeNum_ISDATETIME(type_num)) + { + requestDateEncoding(item, (PyObjectEncoder*) enc); + } +#endif if (!item) { NpyArr_freeLabels(ret, num); @@ -1202,39 +1298,28 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) obj = (PyObject*) _obj; enc = (PyObjectEncoder*) tc->encoder; - tc->prv = PyObject_Malloc(sizeof(TypeContext)); - pc = (TypeContext *) tc->prv; - if (!pc) + if (enc->requestType) { - tc->type = JT_INVALID; - PyErr_NoMemory(); + PRINTMARK(); + tc->type = enc->requestType; + tc->prv = enc->requestTypeContext; + + enc->requestType = 0; + enc->requestTypeContext = NULL; return; } - pc->newObj = NULL; - pc->dictObj = NULL; - pc->itemValue = NULL; - pc->itemName = NULL; - pc->attrList = NULL; - pc->index = 0; - pc->size = 0; - pc->longValue = 0; - pc->cStr = NULL; - pc->npyarr = NULL; - pc->rowLabels = NULL; - pc->columnLabels = NULL; - pc->transpose = 0; - pc->rowLabelsLen = 0; - pc->columnLabelsLen = 0; - - if (PyIter_Check(obj)) + pc = createTypeContext(); + if (!pc) { - PRINTMARK(); - goto ISITERABLE; + tc->type = JT_INVALID; + return; } + tc->prv = pc; if (PyIter_Check(obj) || PyArray_Check(obj)) { + PRINTMARK(); goto ISITERABLE; } @@ -1263,28 +1348,6 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyArray_IsScalar(obj, Datetime)) - { - PRINTMARK(); - if (((PyDatetimeScalarObject*) obj)->obval == get_nat()) { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - - PRINTMARK(); - pc->PyTypeToJSON = NpyDateTimeToJSON; - if (enc->datetimeIso) - { - tc->type = JT_UTF8; - } - else - { - tc->type = JT_LONG; - } - return; - } - else if (PyInt_Check(obj)) { PRINTMARK(); @@ -1297,29 +1360,18 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyArray_IsScalar(obj, Bool)) - { - PRINTMARK(); - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_BOOL)); - tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE; - return; - } - else - if (PyArray_IsScalar(obj, Integer)) + if (PyFloat_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; - tc->type = JT_LONG; - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); - - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + val = PyFloat_AS_DOUBLE (obj); + if (npy_isnan(val) || npy_isinf(val)) { - PRINTMARK(); - goto INVALID; + tc->type = JT_NULL; + } + else + { + pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; } - return; } else @@ -1337,18 +1389,10 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyFloat_Check(obj)) + if (obj == Py_None) { PRINTMARK(); - val = PyFloat_AS_DOUBLE (obj); - if (npy_isnan(val) || npy_isinf(val)) - { - tc->type = JT_NULL; - } - else - { - pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; - } + tc->type = JT_NULL; return; } else @@ -1359,13 +1403,6 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyArray_IsScalar(obj, Float)) - { - PRINTMARK(); - pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; - return; - } - else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (PyObject_TypeCheck(obj, cls_nat)) @@ -1397,67 +1434,63 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (obj == Py_None) + if (PyArray_IsScalar(obj, Datetime)) { PRINTMARK(); - tc->type = JT_NULL; - return; - } - - -ISITERABLE: - - if (PyDict_Check(obj)) - { + if (((PyDatetimeScalarObject*) obj)->obval == get_nat()) { PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = obj; - Py_INCREF(obj); - + tc->type = JT_NULL; return; + } + + PRINTMARK(); + pc->PyTypeToJSON = NpyDateTimeToJSON; + if (enc->datetimeIso) + { + tc->type = JT_UTF8; + } + else + { + tc->type = JT_LONG; + } + return; } else - if (PyList_Check(obj)) + if (PyArray_IsScalar(obj, Integer)) { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = List_iterBegin; - pc->iterEnd = List_iterEnd; - pc->iterNext = List_iterNext; - pc->iterGetValue = List_iterGetValue; - pc->iterGetName = List_iterGetName; - return; + goto INVALID; + } + + return; } else - if (PyTuple_Check(obj)) + if (PyArray_IsScalar(obj, Bool)) { - PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = Tuple_iterBegin; - pc->iterEnd = Tuple_iterEnd; - pc->iterNext = Tuple_iterNext; - pc->iterGetValue = Tuple_iterGetValue; - pc->iterGetName = Tuple_iterGetName; - return; + PRINTMARK(); + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_BOOL)); + tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE; + return; } else - if (PyAnySet_Check(obj)) + if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = Iter_iterBegin; - pc->iterEnd = Iter_iterEnd; - pc->iterNext = Iter_iterNext; - pc->iterGetValue = Iter_iterGetValue; - pc->iterGetName = Iter_iterGetName; + pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; return; } - else + +ISITERABLE: + if (PyObject_TypeCheck(obj, cls_index)) { if (enc->outputFormat == SPLIT) @@ -1629,6 +1662,57 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) } return; } + else + if (PyDict_Check(obj)) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); + + return; + } + else + if (PyList_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } + else + if (PyTuple_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } + else + if (PyAnySet_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Iter_iterBegin; + pc->iterEnd = Iter_iterEnd; + pc->iterNext = Iter_iterNext; + pc->iterGetValue = Iter_iterGetValue; + pc->iterGetName = Iter_iterGetName; + return; + } toDictFunc = PyObject_GetAttrString(obj, "toDict"); @@ -1810,6 +1894,8 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.requestType = 0; + pyEncoder.requestTypeContext = NULL; pyEncoder.datetimeIso = 0; pyEncoder.datetimeUnit = PANDAS_FR_ms; pyEncoder.outputFormat = COLUMNS; diff --git a/vb_suite/packers.py b/vb_suite/packers.py index 9af6a6b1b0c4e..f2eac0e28cd44 100644 --- a/vb_suite/packers.py +++ b/vb_suite/packers.py @@ -92,3 +92,23 @@ def remove(f): packers_write_hdf_table = Benchmark("df.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date) +#---------------------------------------------------------------------- +# json + +setup_int_index = """ +import numpy as np +df.index = np.arange(50000) +""" + +setup = common_setup + """ +df.to_json(f,orient='split') +""" +packers_read_json_date_index = Benchmark("pd.read_json(f, orient='split')", setup, start_date=start_date) +setup = setup + setup_int_index +packers_read_json = Benchmark("pd.read_json(f, orient='split')", setup, start_date=start_date) + +setup = common_setup + """ +""" +packers_write_json_date_index = Benchmark("df.to_json(f,orient='split')", setup, cleanup="remove(f)", start_date=start_date) +setup = setup + setup_int_index +packers_write_json = Benchmark("df.to_json(f,orient='split')", setup, cleanup="remove(f)", start_date=start_date)