From 95c20dbbc785efb9661495abdd08ed58392ff6d6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 29 May 2020 23:47:17 +0000 Subject: [PATCH 01/79] BUG: overflow on to_json with numbers larger than sys.maxsize --- pandas/io/json/_json.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ac6f9ff372601..dc111e9cae8ce 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -25,7 +25,14 @@ from pandas.io.parsers import _validate_integer loads = json.loads -dumps = json.dumps + + +def dumps(obj, default_handler=str, **kwargs): + try: + return json.dumps(obj, **kwargs) + except OverflowError: + return json.dumps(default_handler(obj), **kwargs) + TABLE_SCHEMA_VERSION = "0.20.0" From 6d2f8bd2d2135c1bdb39f2ad52d873527a146f21 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 30 May 2020 00:03:34 +0000 Subject: [PATCH 02/79] TST: overflow on to_json with numbers larger than sys.maxsize (#34395) --- pandas/tests/io/json/test_ujson.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 28b043e65b848..d4132b222641a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -5,6 +5,7 @@ import locale import math import re +import sys import time import dateutil @@ -19,6 +20,8 @@ from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm +from pandas.io.json import dumps + def _clean_dict(d): """ @@ -559,6 +562,17 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) + def test_dumps_ints_larger_than_maxsize(self): + long_input = sys.maxsize + 1 + long_input_as_str = str(long_input) + with pytest.raises(OverflowError): + output = ujson.encode(long_input) + output = dumps(long_input) + + assert long_input_as_str == json.loads(output) + assert output == json.dumps(long_input_as_str) + assert long_input_as_str == ujson.decode(output) + @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] ) From 4fc5b87a2a7b66e4ee5ef1cff202177c26790f3b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 30 May 2020 00:34:15 +0000 Subject: [PATCH 03/79] DOC: update with issue #34395 --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 88bf0e005a221..04ccd336ce52b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -901,6 +901,7 @@ I/O - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) +- Bug in :meth:`json.dumps` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) Plotting ^^^^^^^^ From abfca37d861d1a5ac72d53c8ce23edaf47a3f2ea Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 02:30:36 +0000 Subject: [PATCH 04/79] TST: removed unused import --- pandas/tests/io/json/test_ujson.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index d4132b222641a..87e78330bb69c 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -20,8 +20,6 @@ from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm -from pandas.io.json import dumps - def _clean_dict(d): """ @@ -563,15 +561,12 @@ def test_encode_long_conversion(self): assert long_input == ujson.decode(output) def test_dumps_ints_larger_than_maxsize(self): - long_input = sys.maxsize + 1 - long_input_as_str = str(long_input) - with pytest.raises(OverflowError): - output = ujson.encode(long_input) - output = dumps(long_input) + # GH34395 + big_num = sys.maxsize + 1 + encoding = ujson.dumps(big_num) - assert long_input_as_str == json.loads(output) - assert output == json.dumps(long_input_as_str) - assert long_input_as_str == ujson.decode(output) + assert encoding == json.dumps(big_num) + assert ujson.loads(big_num) == big_num @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] From 7e63941d79a16e4f51c40f2179f0fb36aa90f060 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 02:32:59 +0000 Subject: [PATCH 05/79] ENH: added case JT_BIGNUM to encode --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 44 +++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 065e3b2c60cf9..8d1397a45ad6d 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -925,6 +925,8 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t szlen; JSONTypeContext tc; tc.encoder = enc; + + printf("running encode \n"); if (enc->level > enc->recursionMax) { SetError(obj, enc, "Maximum recursion level reached"); @@ -937,13 +939,14 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, length of _name as encoded worst case + maxLength of double to string OR maxLength of JSLONG to string */ - + printf("Reserving buffer"); Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); if (enc->errorMsg) { return; } if (name) { + printf("if (name)"); Buffer_AppendCharUnchecked(enc, '\"'); if (enc->forceASCII) { @@ -963,7 +966,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, Buffer_AppendCharUnchecked(enc, ' '); #endif } - + printf("enc->beginTypeContext\n"); enc->beginTypeContext(obj, &tc); switch (tc.type) { @@ -1003,6 +1006,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_OBJECT: { + printf("running JT_OBJECT \n"); count = 0; enc->iterBegin(obj, &tc); @@ -1035,11 +1039,13 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_LONG: { + printf("case: JT_LONG\n"); Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); break; } case JT_INT: { + printf("case: JT_INT\n"); Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); break; } @@ -1070,6 +1076,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_DOUBLE: { + printf("case: JT_DOUBLE\n"); if (!Buffer_AppendDoubleUnchecked(obj, enc, enc->getDoubleValue(obj, &tc))) { enc->endTypeContext(obj, &tc); @@ -1080,6 +1087,39 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_UTF8: { + printf("case: JT_UTF8\n"); + value = enc->getStringValue(obj, &tc, &szlen); + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) { + enc->endTypeContext(obj, &tc); + return; + } + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } + + Buffer_AppendCharUnchecked(enc, '\"'); + break; + } + + case JT_BIGNUM: { + printf("case JT_BIGNUM:\n"); + // printf("%s", enc->bigNum); + break; value = enc->getStringValue(obj, &tc, &szlen); Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { From 3353420d1c0af301504a9e26ebe379c597a5f413 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 02:33:15 +0000 Subject: [PATCH 06/79] ENH: added JT_BIGNUM to JSTYPES --- pandas/_libs/src/ujson/lib/ultrajson.h | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index acb66b668e8dc..5db6640a5d6bb 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -150,6 +150,7 @@ enum JSTYPES { JT_INT, // (JSINT32 (signed 32-bit)) JT_LONG, // (JSINT64 (signed 64-bit)) JT_DOUBLE, // (double) + JT_BIGNUM, // integer larger than sys.maxsize JT_UTF8, // (char 8-bit) JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure From c9574b8cab8a18f28e2c8c530319db57810720c3 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 02:35:37 +0000 Subject: [PATCH 07/79] BUG: changed error for ints>sys.maxsize into JT_BIGNUM --- pandas/_libs/src/ujson/python/objToJSON.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c71e941f7d6e8..8d5600cfe1d00 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -106,6 +106,8 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; + char bigNum; + char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; @@ -1635,7 +1637,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); - goto INVALID; + tc->type = JT_BIGNUM; } return; From 94c112fb0028949b8b076a0725ce53a2d243b842 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 02:39:14 +0000 Subject: [PATCH 08/79] ENH: removed debug statements --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 40 ++--------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 8d1397a45ad6d..c2d7de9f16f4d 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -925,8 +925,6 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t szlen; JSONTypeContext tc; tc.encoder = enc; - - printf("running encode \n"); if (enc->level > enc->recursionMax) { SetError(obj, enc, "Maximum recursion level reached"); @@ -939,14 +937,13 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, length of _name as encoded worst case + maxLength of double to string OR maxLength of JSLONG to string */ - printf("Reserving buffer"); + Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); if (enc->errorMsg) { return; } if (name) { - printf("if (name)"); Buffer_AppendCharUnchecked(enc, '\"'); if (enc->forceASCII) { @@ -966,7 +963,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, Buffer_AppendCharUnchecked(enc, ' '); #endif } - printf("enc->beginTypeContext\n"); + enc->beginTypeContext(obj, &tc); switch (tc.type) { @@ -1006,7 +1003,6 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_OBJECT: { - printf("running JT_OBJECT \n"); count = 0; enc->iterBegin(obj, &tc); @@ -1039,13 +1035,11 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_LONG: { - printf("case: JT_LONG\n"); Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); break; } case JT_INT: { - printf("case: JT_INT\n"); Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); break; } @@ -1076,7 +1070,6 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_DOUBLE: { - printf("case: JT_DOUBLE\n"); if (!Buffer_AppendDoubleUnchecked(obj, enc, enc->getDoubleValue(obj, &tc))) { enc->endTypeContext(obj, &tc); @@ -1087,7 +1080,6 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_UTF8: { - printf("case: JT_UTF8\n"); value = enc->getStringValue(obj, &tc, &szlen); Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { @@ -1117,34 +1109,6 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_BIGNUM: { - printf("case JT_BIGNUM:\n"); - // printf("%s", enc->bigNum); - break; - value = enc->getStringValue(obj, &tc, &szlen); - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } - - Buffer_AppendCharUnchecked(enc, '\"'); break; } } From 76576b82a51d729c884caa580ef293fa02e14945 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 02:41:38 +0000 Subject: [PATCH 09/79] BUG: removed dumps wrapper --- pandas/io/json/_json.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index dc111e9cae8ce..d4797adeaa97b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -25,13 +25,7 @@ from pandas.io.parsers import _validate_integer loads = json.loads - - -def dumps(obj, default_handler=str, **kwargs): - try: - return json.dumps(obj, **kwargs) - except OverflowError: - return json.dumps(default_handler(obj), **kwargs) +dumps = json.dumps TABLE_SCHEMA_VERSION = "0.20.0" From 9f211a5d00e206a7708be6724912abb449263739 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 21:52:55 +0000 Subject: [PATCH 10/79] removed bigNum from TypeContext --- pandas/_libs/src/ujson/python/objToJSON.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 8d5600cfe1d00..bc6ccb8351778 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -106,8 +106,6 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char bigNum; - char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; From 2b7a271948d715b9bb0725427a0d735a4f2d17ef Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 22:07:58 +0000 Subject: [PATCH 11/79] TST: fixed bug in the test --- pandas/tests/io/json/test_ujson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 87e78330bb69c..b6a5834282ed4 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -566,7 +566,7 @@ def test_dumps_ints_larger_than_maxsize(self): encoding = ujson.dumps(big_num) assert encoding == json.dumps(big_num) - assert ujson.loads(big_num) == big_num + assert ujson.loads(encoding) == big_num @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] From 5e06109b154b069add334cbeb2938b21b4d0d45e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 22:29:46 +0000 Subject: [PATCH 12/79] added pointer to string rep converter for BigNum --- pandas/_libs/src/ujson/lib/ultrajson.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 5db6640a5d6bb..69284e1c3f2ab 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -188,6 +188,8 @@ typedef struct __JSONObjectEncoder { JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen); /* Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) From 755ef4788c79bc929265cd6d82d9ab256029be13 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 3 Jun 2020 23:04:47 +0000 Subject: [PATCH 13/79] TST: removed ujson.loads from the test --- pandas/tests/io/json/test_ujson.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index b6a5834282ed4..43b64286ffad7 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -566,7 +566,8 @@ def test_dumps_ints_larger_than_maxsize(self): encoding = ujson.dumps(big_num) assert encoding == json.dumps(big_num) - assert ujson.loads(encoding) == big_num + # ujson.loads to be fixed in the future + # assert ujson.loads(encoding) == big_num @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] From 0e768f8523ecaca9fab634d8280ab035bbfc3d3d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 4 Jun 2020 03:44:35 +0000 Subject: [PATCH 14/79] added getBigNumStringValue --- pandas/_libs/src/ujson/lib/ultrajson.h | 3 +-- pandas/_libs/src/ujson/python/objToJSON.c | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 69284e1c3f2ab..d823a8b100027 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -188,8 +188,7 @@ typedef struct __JSONObjectEncoder { JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); - const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen); + const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc); /* Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index bc6ccb8351778..90357f6cc7cfc 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1636,6 +1636,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; + /* Question (arw2019): is this where I convert obj into a string + and load result into tc->cStr? */ } return; @@ -2126,6 +2128,12 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->doubleValue; } +const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc) { + // Question (arw2019): is this function right? + // Do I need the outLen argument for example? + return GET_TC(tc)->cStr; +} + static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { From 12d73b044a09d86871c22b5ba1b7cb0af296e983 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 4 Jun 2020 03:45:43 +0000 Subject: [PATCH 15/79] added code to JT_BIGNUM handler by analogy with JT_UTF8 --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index c2d7de9f16f4d..94fba6b348fc0 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1109,7 +1109,24 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_BIGNUM: { + + value = enc->getBigNumStringValue(obj, &tc); + + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) { + enc->endTypeContext(obj, &tc); + return; + } + + if (!Buffer_EscapeStringValidated(obj, enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + break; + } } From 6c2aa9f730cf9e7db66cae61f0d5474c4cf2cfe0 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 5 Jun 2020 23:31:43 -0400 Subject: [PATCH 16/79] TST: update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd --- pandas/tests/io/json/test_ujson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 43b64286ffad7..6e7348d58d842 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -565,7 +565,7 @@ def test_dumps_ints_larger_than_maxsize(self): big_num = sys.maxsize + 1 encoding = ujson.dumps(big_num) - assert encoding == json.dumps(big_num) + assert str(encoding) == json.dumps(big_num) # ujson.loads to be fixed in the future # assert ujson.loads(encoding) == big_num From 1a8051ffd6a0f2f55c2d8f75a68d55ca87f34bb4 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 6 Jun 2020 20:26:16 +0000 Subject: [PATCH 17/79] added Object_getBigNumStringValue to pyEncoder --- pandas/_libs/src/ujson/python/objToJSON.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 90357f6cc7cfc..cf3d31cf227bc 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1636,8 +1636,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; - /* Question (arw2019): is this where I convert obj into a string - and load result into tc->cStr? */ } return; @@ -2128,11 +2126,14 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->doubleValue; } +// /* const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc) { - // Question (arw2019): is this function right? - // Do I need the outLen argument for example? + // char wstr[100]; + int sign = (obj<0) ? -1 : 1; + printf("sign %d\n", sign); return GET_TC(tc)->cStr; } +// */ static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } @@ -2187,6 +2188,7 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, Object_endTypeContext, Object_getStringValue, Object_getLongValue, + Object_getBigNumStringValue, NULL, // getIntValue is unused Object_getDoubleValue, Object_iterBegin, From 552194e6d6bae5b42bf8f1e13d97680a9a8b239c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 6 Jun 2020 23:33:42 +0000 Subject: [PATCH 18/79] added skeletal code for Object_GetBigNumStringValue --- pandas/_libs/src/ujson/python/objToJSON.c | 37 ++++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index cf3d31cf227bc..3e7f94ca7ce0d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1636,6 +1636,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; + + // This line generates compiler errors. + GET_TC(tc)->cStr = Object_getBigNumStringValue(obj); } return; @@ -2127,10 +2130,36 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } // /* -const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc) { - // char wstr[100]; - int sign = (obj<0) ? -1 : 1; - printf("sign %d\n", sign); +const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen) { + /* here goes the code that converts obj into a string + char *wstr; + if (obj<0) {wstr++ = "-";} + int digit; + PyObject rem; + PyObject ten; + long ten_as_long = 10, rem_as_long; + ten = PyNumber_FromLong(ten_as_long); + do { + rem = PyNumber_Remainder(obj, ten); + obj = PyNumber_FloorDivide(obj, ten); + + rem_as_long = PyLong_AsLong(rem); + wstr++ = char(48 + (int) rem_as_long); + } while (obj>10); + */ + + // we then load that string into tc->cStr + GET_TC(tc)->str = wstr; + + /* _outLen: do we set that here? + I can imagine counting the number of digits in the + do-while loop and then setting + _outLen = (number of digits); + I'm not quite sure how that would work though + since _outLen is an argument to this function + */ + return GET_TC(tc)->cStr; } // */ From e2898ef7680580ef9a75b1a68ad497008af561ba Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 7 Jun 2020 04:10:54 +0000 Subject: [PATCH 19/79] completed Object_getBigNumStringValue using PyObject_Repr --- pandas/_libs/src/ujson/python/objToJSON.c | 37 +++-------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 3e7f94ca7ce0d..e6782cb8480ee 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1636,9 +1636,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; - - // This line generates compiler errors. - GET_TC(tc)->cStr = Object_getBigNumStringValue(obj); } return; @@ -2129,40 +2126,16 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->doubleValue; } -// /* const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { - /* here goes the code that converts obj into a string - char *wstr; - if (obj<0) {wstr++ = "-";} - int digit; - PyObject rem; - PyObject ten; - long ten_as_long = 10, rem_as_long; - ten = PyNumber_FromLong(ten_as_long); - do { - rem = PyNumber_Remainder(obj, ten); - obj = PyNumber_FloorDivide(obj, ten); - - rem_as_long = PyLong_AsLong(rem); - wstr++ = char(48 + (int) rem_as_long); - } while (obj>10); - */ - - // we then load that string into tc->cStr - GET_TC(tc)->str = wstr; - - /* _outLen: do we set that here? - I can imagine counting the number of digits in the - do-while loop and then setting - _outLen = (number of digits); - I'm not quite sure how that would work though - since _outLen is an argument to this function - */ + PyObject* repr = PyObject_Repr(obj); + PyObject* str = PyUnicode_AsEncodedString(repr, "utf-8", "~E~"); + const char *bytes = PyBytes_AS_STRING(str); + GET_TC(tc)->str = bytes; + return GET_TC(tc)->cStr; } -// */ static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } From 29439955c6cb3995bd9348401e7449844b99d364 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 7 Jun 2020 04:20:31 +0000 Subject: [PATCH 20/79] BUG: changed Object_getBigNumStringValue --- pandas/_libs/src/ujson/python/objToJSON.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e6782cb8480ee..3f88586fd9b03 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2130,9 +2130,10 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Repr(obj); PyObject* str = PyUnicode_AsEncodedString(repr, "utf-8", "~E~"); - const char *bytes = PyBytes_AS_STRING(str); + char *bytes = PyBytes_AS_STRING(str); - GET_TC(tc)->str = bytes; + PyObject_Free(GET_TC(tc)->cStr); + GET_TC(tc)->cStr = bytes; return GET_TC(tc)->cStr; } From 771ec5df27e639b111e8633bc75f42a531237080 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 05:17:49 +0000 Subject: [PATCH 21/79] improved Object_getBigNumStringValue some more --- pandas/_libs/src/ujson/python/objToJSON.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 3f88586fd9b03..f160499ca8474 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2128,11 +2128,13 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { - PyObject* repr = PyObject_Repr(obj); + PyObject* repr = PyObject_Str(obj); PyObject* str = PyUnicode_AsEncodedString(repr, "utf-8", "~E~"); char *bytes = PyBytes_AS_STRING(str); - PyObject_Free(GET_TC(tc)->cStr); + Py_XDECREF(repr); + Py_XDECREF(str); + GET_TC(tc)->cStr = bytes; return GET_TC(tc)->cStr; From 92bc6ef8aa383087cb039e097b669e5a9611a61b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 05:27:10 +0000 Subject: [PATCH 22/79] update getBigNumStringValue argument --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 94fba6b348fc0..c27c4b9883220 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1110,7 +1110,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, case JT_BIGNUM: { - value = enc->getBigNumStringValue(obj, &tc); + value = enc->getBigNumStringValue(obj, &tc, &szlen); Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { @@ -1124,9 +1124,9 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, enc->level--; return; } - - break; + break; + } } From 8f3af8c018ef992c81cec6ed70f3a4295b3fd0ad Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 06:44:32 +0000 Subject: [PATCH 23/79] corrected Object_getBigNumStringValue --- pandas/_libs/src/ujson/python/objToJSON.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f160499ca8474..a4861fc353e74 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2129,13 +2129,14 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); - PyObject* str = PyUnicode_AsEncodedString(repr, "utf-8", "~E~"); - char *bytes = PyBytes_AS_STRING(str); + PyObject* str = PyUnicode_AsUTF8AndSize(repr, _outLen); + char bytes[_outlen]; + memcpy(bytes, PyBytes_AS_STRING(str), _outLen); + memcpy(GET_TC(tc)->cStr, bytes, _outLen); Py_XDECREF(repr); Py_XDECREF(str); - - GET_TC(tc)->cStr = bytes; + free(bytes); return GET_TC(tc)->cStr; } From cdae92e8ffa3337e3057226994bf0caa6ee879ac Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 07:26:23 +0000 Subject: [PATCH 24/79] more fixes to Object_getBigNumStringValue --- pandas/_libs/src/ujson/python/objToJSON.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a4861fc353e74..5bf847f31f5f7 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2129,10 +2129,12 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); - PyObject* str = PyUnicode_AsUTF8AndSize(repr, _outLen); - char bytes[_outlen]; - memcpy(bytes, PyBytes_AS_STRING(str), _outLen); - memcpy(GET_TC(tc)->cStr, bytes, _outLen); + Py_ssize_t* _py_ssize_t_outLen; + const char *str = PyUnicode_AsUTF8AndSize(repr,_py_ssize_t_outLen); + // need to cast _py_ssize_t_outLen as C size_t type (_outLen) + char* bytes = malloc(&_outLen); + memcpy(bytes, PyBytes_AS_STRING(str), &_outLen); + memcpy(GET_TC(tc)->cStr, bytes, &_outLen); Py_XDECREF(repr); Py_XDECREF(str); From 1009168b93a2c175cb9794083114056aac3e5e03 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 10 Jun 2020 17:01:24 -0400 Subject: [PATCH 25/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 5bf847f31f5f7..af50e7fa8c3ba 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2132,7 +2132,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, Py_ssize_t* _py_ssize_t_outLen; const char *str = PyUnicode_AsUTF8AndSize(repr,_py_ssize_t_outLen); // need to cast _py_ssize_t_outLen as C size_t type (_outLen) - char* bytes = malloc(&_outLen); + char* bytes = malloc(*_outLen); memcpy(bytes, PyBytes_AS_STRING(str), &_outLen); memcpy(GET_TC(tc)->cStr, bytes, &_outLen); From 759ad8a2bfc167a808a29ee267ddf6bd9e65ed65 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 10 Jun 2020 17:02:00 -0400 Subject: [PATCH 26/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index af50e7fa8c3ba..32e4755dcb8eb 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2137,7 +2137,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, memcpy(GET_TC(tc)->cStr, bytes, &_outLen); Py_XDECREF(repr); - Py_XDECREF(str); + Py_DECREF(str); free(bytes); return GET_TC(tc)->cStr; From 5e01ed0ed3be84c672514cb187e2f50adf1f8029 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 10 Jun 2020 17:02:08 -0400 Subject: [PATCH 27/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 32e4755dcb8eb..aec280c308e6b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2136,7 +2136,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, memcpy(bytes, PyBytes_AS_STRING(str), &_outLen); memcpy(GET_TC(tc)->cStr, bytes, &_outLen); - Py_XDECREF(repr); + Py_DECREF(repr); Py_DECREF(str); free(bytes); From 8a08a3820cb589296de4fd68aa7394069e2d4d28 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 10 Jun 2020 17:02:23 -0400 Subject: [PATCH 28/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index aec280c308e6b..b161d940dcb8d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2134,7 +2134,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, // need to cast _py_ssize_t_outLen as C size_t type (_outLen) char* bytes = malloc(*_outLen); memcpy(bytes, PyBytes_AS_STRING(str), &_outLen); - memcpy(GET_TC(tc)->cStr, bytes, &_outLen); + GET_TC(tc)->cStr = bytes; Py_DECREF(repr); Py_DECREF(str); From 0441fe702b6037a234d2abbfe801821b51e7b49a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 10 Jun 2020 17:07:12 -0400 Subject: [PATCH 29/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b161d940dcb8d..b5388aefecf9c 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2133,7 +2133,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, const char *str = PyUnicode_AsUTF8AndSize(repr,_py_ssize_t_outLen); // need to cast _py_ssize_t_outLen as C size_t type (_outLen) char* bytes = malloc(*_outLen); - memcpy(bytes, PyBytes_AS_STRING(str), &_outLen); + memcpy(bytes, str, *_outLen); GET_TC(tc)->cStr = bytes; Py_DECREF(repr); From 4630c0d8d387ee777736c653aad84f3eee2cb1ff Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 10 Jun 2020 17:23:17 -0400 Subject: [PATCH 30/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b5388aefecf9c..07a914c4fa13d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2130,7 +2130,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); Py_ssize_t* _py_ssize_t_outLen; - const char *str = PyUnicode_AsUTF8AndSize(repr,_py_ssize_t_outLen); + const char *str = PyUnicode_AsUTF8AndSize(repr, *_outLen); // need to cast _py_ssize_t_outLen as C size_t type (_outLen) char* bytes = malloc(*_outLen); memcpy(bytes, str, *_outLen); From 2e06a8b3c274b1470c3b4720fac084279bd21554 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 21:24:34 +0000 Subject: [PATCH 31/79] Update pandas/_libs/src/ujson/python/objToJSON.c --- pandas/_libs/src/ujson/python/objToJSON.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 07a914c4fa13d..4f91029c47c0b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2129,9 +2129,7 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); - Py_ssize_t* _py_ssize_t_outLen; const char *str = PyUnicode_AsUTF8AndSize(repr, *_outLen); - // need to cast _py_ssize_t_outLen as C size_t type (_outLen) char* bytes = malloc(*_outLen); memcpy(bytes, str, *_outLen); GET_TC(tc)->cStr = bytes; From 63056fcb4d5b6b4e1499a539b5b4b2f72d3f876d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 21:59:50 +0000 Subject: [PATCH 32/79] Update pandas/_libs/src/ujson/python/objToJSON.c --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 4f91029c47c0b..b2bafbfea1e81 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2136,7 +2136,6 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, Py_DECREF(repr); Py_DECREF(str); - free(bytes); return GET_TC(tc)->cStr; } From 6ec960e536b8f7abdff0f3645987b53fbffa633d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 10 Jun 2020 22:23:58 +0000 Subject: [PATCH 33/79] updated pyEncoder for JT_BIGNUM --- pandas/_libs/src/ujson/python/objToJSON.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b2bafbfea1e81..359b97521381a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1636,6 +1636,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; + pc->PyTypeToUTF8 = Object_getBigNumStringValue; } return; @@ -2129,7 +2130,7 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); - const char *str = PyUnicode_AsUTF8AndSize(repr, *_outLen); + const char *str = PyUnicode_AsUTF8AndSize(repr, ( *_outLen)); char* bytes = malloc(*_outLen); memcpy(bytes, str, *_outLen); GET_TC(tc)->cStr = bytes; From c63a5c9f1c875b7cab77231d56ca4c7488d4e6ac Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 17 Jun 2020 06:59:46 +0000 Subject: [PATCH 34/79] updated pyEncoder --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 359b97521381a..9c95e413270f2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2130,7 +2130,7 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); - const char *str = PyUnicode_AsUTF8AndSize(repr, ( *_outLen)); + const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen); char* bytes = malloc(*_outLen); memcpy(bytes, str, *_outLen); GET_TC(tc)->cStr = bytes; From b2f8f46fd40f21d4acc8fdb4d17f8e0f6073e1a8 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 17 Jun 2020 21:17:28 +0000 Subject: [PATCH 35/79] moved getBigNumStringValue to pyEncoder --- pandas/_libs/src/ujson/lib/ultrajson.h | 1 - pandas/_libs/src/ujson/python/objToJSON.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index d823a8b100027..5db6640a5d6bb 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -188,7 +188,6 @@ typedef struct __JSONObjectEncoder { JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); - const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc); /* Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 9c95e413270f2..129f9d4ee6705 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -136,6 +136,8 @@ typedef struct __PyObjectEncoder { int outputFormat; int originalOutputFormat; + const char *Object_getBigNumStringValue; + PyObject *defaultHandler; } PyObjectEncoder; @@ -1636,7 +1638,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; - pc->PyTypeToUTF8 = Object_getBigNumStringValue; + pc->PyTypeToUTF8 = enc->Object_getBigNumStringValue; } return; From fea9348bd2c28bd6e9a9dd60ef57f8d47b6ad5c5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 04:10:59 +0000 Subject: [PATCH 36/79] fixed declaration of Object_getBigNumStringValue --- pandas/_libs/src/ujson/lib/ultrajson.h | 2 ++ pandas/_libs/src/ujson/lib/ultrajsonenc.c | 11 +---------- pandas/_libs/src/ujson/python/objToJSON.c | 5 +---- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 5db6640a5d6bb..69284e1c3f2ab 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -188,6 +188,8 @@ typedef struct __JSONObjectEncoder { JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen); /* Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index c27c4b9883220..80ddf081b15ae 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1109,22 +1109,13 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } case JT_BIGNUM: { - value = enc->getBigNumStringValue(obj, &tc, &szlen); - Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { enc->endTypeContext(obj, &tc); return; } - - if (!Buffer_EscapeStringValidated(obj, enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - + break; } diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 129f9d4ee6705..2529f791e564c 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -136,8 +136,6 @@ typedef struct __PyObjectEncoder { int outputFormat; int originalOutputFormat; - const char *Object_getBigNumStringValue; - PyObject *defaultHandler; } PyObjectEncoder; @@ -1638,7 +1636,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); tc->type = JT_BIGNUM; - pc->PyTypeToUTF8 = enc->Object_getBigNumStringValue; } return; @@ -2196,9 +2193,9 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, Object_endTypeContext, Object_getStringValue, Object_getLongValue, - Object_getBigNumStringValue, NULL, // getIntValue is unused Object_getDoubleValue, + Object_getBigNumStringValue, Object_iterBegin, Object_iterNext, Object_iterEnd, From 651607805c7bb33ee768f5e10a00a558a7deadf0 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 05:22:06 +0000 Subject: [PATCH 37/79] fixed Object_getBigNumStringValue --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 2529f791e564c..bbc4938395dd8 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2309,7 +2309,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, if (ret != buffer) { encoder->free(ret); } - PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); return NULL; } From aa2dbca58ddebc920837a8afae8534b22331b59e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 16:09:20 +0000 Subject: [PATCH 38/79] catch overflow error with PyLong_AsLongLongAndOverflow --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 19 ++++++++++++++++++- pandas/_libs/src/ujson/python/objToJSON.c | 12 +++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 80ddf081b15ae..51aa39a16920e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1110,12 +1110,29 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, case JT_BIGNUM: { value = enc->getBigNumStringValue(obj, &tc, &szlen); + Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { enc->endTypeContext(obj, &tc); return; } - + + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } + break; } diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index bbc4938395dd8..51a20dd1109ec 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1629,15 +1629,21 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyLong_Check(obj)) { PRINTMARK(); tc->type = JT_LONG; - GET_TC(tc)->longValue = PyLong_AsLongLong(obj); - + int _overflow = 0; + GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &_overflow); exc = PyErr_Occurred(); - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + if (!exc && (_overflow!=0)){ PRINTMARK(); tc->type = JT_BIGNUM; } + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } + + return; } else if (PyFloat_Check(obj)) { PRINTMARK(); From 7eaf42da48eba4d13cc51d6d70f2a3d2fe1692bf Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 16:22:53 +0000 Subject: [PATCH 39/79] remove unnecessary error check --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 51a20dd1109ec..6c75431d97f6d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1633,7 +1633,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &_overflow); exc = PyErr_Occurred(); - if (!exc && (_overflow!=0)){ + if (_overflow){ PRINTMARK(); tc->type = JT_BIGNUM; } From 56d5bacf59d4006870813f0bedc22b81b2459004 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 16:26:02 +0000 Subject: [PATCH 40/79] added shortcircuit for error check --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 6c75431d97f6d..72d314290aa32 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1633,7 +1633,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &_overflow); exc = PyErr_Occurred(); - if (_overflow){ + if ((GET_TC(tc)->longValue == -1) && _overflow && !exc){ PRINTMARK(); tc->type = JT_BIGNUM; } From 1cdb1ba45e40ec377f14ca57d31b45e36be8b2de Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 12:41:30 -0400 Subject: [PATCH 41/79] simplify int overflow error catching Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 72d314290aa32..510fdaf9ad14b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1631,7 +1631,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_LONG; int _overflow = 0; GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &_overflow); - exc = PyErr_Occurred(); + exc = GET_TC(tc)->LongValue == -1 && PyErr_Occurred(); if ((GET_TC(tc)->longValue == -1) && _overflow && !exc){ PRINTMARK(); From 821d51fa80bfa7849576cea393a17833c02fc8cc Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 12:42:19 -0400 Subject: [PATCH 42/79] Update long int test in pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd --- pandas/tests/io/json/test_ujson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 6e7348d58d842..c9781122eccc3 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -565,7 +565,7 @@ def test_dumps_ints_larger_than_maxsize(self): big_num = sys.maxsize + 1 encoding = ujson.dumps(big_num) - assert str(encoding) == json.dumps(big_num) + assert str(big_num) == json.dumps(big_num) # ujson.loads to be fixed in the future # assert ujson.loads(encoding) == big_num From 1001ac141c0ada9c99ab344c25dda7dfc799fe81 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 16:57:35 +0000 Subject: [PATCH 43/79] removed tests expecting numeric overflow --- pandas/tests/io/json/test_ujson.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index c9781122eccc3..c0f0e7f5221c9 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -563,9 +563,8 @@ def test_encode_long_conversion(self): def test_dumps_ints_larger_than_maxsize(self): # GH34395 big_num = sys.maxsize + 1 - encoding = ujson.dumps(big_num) - assert str(big_num) == json.dumps(big_num) + assert str(big_num) == ujson.dumps(big_num) # ujson.loads to be fixed in the future # assert ujson.loads(encoding) == big_num @@ -589,8 +588,7 @@ class Nested: x = 12839128391289382193812939 for _ in range(0, 100): - with pytest.raises(OverflowError): - ujson.encode(Nested()) + ujson.encode(Nested()) @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1]) def test_decode_number_with_32bit_sign_bit(self, val): From b8f16b6f43edbaf2805f0f301fe7324b8968d5e1 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 12:58:45 -0400 Subject: [PATCH 44/79] remove underscore from overflow Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 510fdaf9ad14b..c463b1eb4679e 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1629,7 +1629,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyLong_Check(obj)) { PRINTMARK(); tc->type = JT_LONG; - int _overflow = 0; + int overflow = 0; GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &_overflow); exc = GET_TC(tc)->LongValue == -1 && PyErr_Occurred(); From a6e83c730578b439d2dedd39a9208e6e79837726 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 16:59:47 +0000 Subject: [PATCH 45/79] removed underscores from _overflow everywhere --- pandas/_libs/src/ujson/python/objToJSON.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c463b1eb4679e..3a6d61250ed05 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1630,10 +1630,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); tc->type = JT_LONG; int overflow = 0; - GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &_overflow); + GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); exc = GET_TC(tc)->LongValue == -1 && PyErr_Occurred(); - if ((GET_TC(tc)->longValue == -1) && _overflow && !exc){ + if ((GET_TC(tc)->longValue == -1) && overflow && !exc){ PRINTMARK(); tc->type = JT_BIGNUM; } From ccc5b4781346cbd3dd8143c7086def7c615eae1c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 17:00:40 +0000 Subject: [PATCH 46/79] fixed small typo --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 3a6d61250ed05..0b393a98f420b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1631,7 +1631,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_LONG; int overflow = 0; GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); - exc = GET_TC(tc)->LongValue == -1 && PyErr_Occurred(); + exc = GET_TC(tc)->longValue == -1 && PyErr_Occurred(); if ((GET_TC(tc)->longValue == -1) && overflow && !exc){ PRINTMARK(); From 585b9856225b4c3491d5ba47ff0a384001993d69 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 17:15:15 +0000 Subject: [PATCH 47/79] fix type of exc --- pandas/_libs/src/ujson/python/objToJSON.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 0b393a98f420b..962c470197741 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1631,14 +1631,15 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_LONG; int overflow = 0; GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); - exc = GET_TC(tc)->longValue == -1 && PyErr_Occurred(); + _Bool err; + err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred(); - if ((GET_TC(tc)->longValue == -1) && overflow && !exc){ + if (overflow && !err){ PRINTMARK(); tc->type = JT_BIGNUM; } - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + if (err && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); goto INVALID; } From 7586698014313ece8ef555ed62fba7a7c029c404 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 18 Jun 2020 17:29:35 +0000 Subject: [PATCH 48/79] deleted numeric overflow tests --- pandas/tests/io/json/test_ujson.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index c0f0e7f5221c9..6b3d0c9185279 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -563,8 +563,9 @@ def test_encode_long_conversion(self): def test_dumps_ints_larger_than_maxsize(self): # GH34395 big_num = sys.maxsize + 1 + encoding = ujson.encode(big_num) - assert str(big_num) == ujson.dumps(big_num) + assert str(big_num) == encoding # ujson.loads to be fixed in the future # assert ujson.loads(encoding) == big_num @@ -579,17 +580,6 @@ def test_loads_non_str_bytes_raises(self): with pytest.raises(TypeError, match=msg): ujson.loads(None) - def test_encode_numeric_overflow(self): - with pytest.raises(OverflowError): - ujson.encode(12839128391289382193812939) - - def test_encode_numeric_overflow_nested(self): - class Nested: - x = 12839128391289382193812939 - - for _ in range(0, 100): - ujson.encode(Nested()) - @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1]) def test_decode_number_with_32bit_sign_bit(self, val): # Test that numbers that fit within 32 bits but would have the From 0e6768f3968ea89424e9279ee2dd78414de8f88e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 13:31:36 -0400 Subject: [PATCH 49/79] remove extraneous condition in if statement Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 962c470197741..1b6b666f333de 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1634,7 +1634,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { _Bool err; err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred(); - if (overflow && !err){ + if (overflow){ PRINTMARK(); tc->type = JT_BIGNUM; } From 7c19bd2e5aa9740667eabd9b70e1ff0d12049912 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 13:32:03 -0400 Subject: [PATCH 50/79] remove extraneous condition in if statement Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 1b6b666f333de..cdc172734489f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1639,7 +1639,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_BIGNUM; } - if (err && PyErr_ExceptionMatches(PyExc_OverflowError)) { + if (err) { PRINTMARK(); goto INVALID; } From 9809d7ce35a5012798f677fef63639e06fad6435 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 13:37:07 -0400 Subject: [PATCH 51/79] change _Bool into int Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index cdc172734489f..034be9095516a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1631,7 +1631,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_LONG; int overflow = 0; GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); - _Bool err; + int err; err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred(); if (overflow){ From 2739f3d945d0fbb190d4a1b00d6832bb57edb20e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 13:55:17 -0400 Subject: [PATCH 52/79] Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 034be9095516a..e4e81324e2272 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2142,7 +2142,6 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, GET_TC(tc)->cStr = bytes; Py_DECREF(repr); - Py_DECREF(str); return GET_TC(tc)->cStr; } From 77d69b7a0b02563355371186e0cd607636b248f1 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 14:29:58 -0400 Subject: [PATCH 53/79] Update pandas/_libs/src/ujson/lib/ultrajsonenc.c Co-authored-by: William Ayd --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 51aa39a16920e..6aa9da1286abf 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1111,7 +1111,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, case JT_BIGNUM: { value = enc->getBigNumStringValue(obj, &tc, &szlen); - Buffer_Reserve(enc, RESERVE_STRING(szlen)); + Buffer_Reserve(enc, szlen); if (enc->errorMsg) { enc->endTypeContext(obj, &tc); return; From f003d6ba2de6b446559226f112f32ce26ba092a7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 20:56:10 -0400 Subject: [PATCH 54/79] allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e4e81324e2272..77345ebb0194a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2137,7 +2137,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen); - char* bytes = malloc(*_outLen); + char* bytes = malloc(*_outLen + 1); memcpy(bytes, str, *_outLen); GET_TC(tc)->cStr = bytes; From ee505c9b73259c526595f0a9692c72bc80e0d663 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 18 Jun 2020 20:56:27 -0400 Subject: [PATCH 55/79] allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 77345ebb0194a..96c0dbdc80b9f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2138,7 +2138,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, PyObject* repr = PyObject_Str(obj); const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen); char* bytes = malloc(*_outLen + 1); - memcpy(bytes, str, *_outLen); + memcpy(bytes, str, *_outLen + 1); GET_TC(tc)->cStr = bytes; Py_DECREF(repr); From cdc0870fadaaa8ab93ca0f3a6b21c6e422a181f4 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 01:11:40 +0000 Subject: [PATCH 56/79] reinstate RESERVE_STRING(szlen) in JT_BIGNUM case --- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 6aa9da1286abf..51aa39a16920e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1111,7 +1111,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, case JT_BIGNUM: { value = enc->getBigNumStringValue(obj, &tc, &szlen); - Buffer_Reserve(enc, szlen); + Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { enc->endTypeContext(obj, &tc); return; From 0fba3d5d9d33234a38d925145ef609b08162a908 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 01:13:12 +0000 Subject: [PATCH 57/79] replaced (private) with (public) in whatnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 04ccd336ce52b..691780bd49f29 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -901,7 +901,7 @@ I/O - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) -- Bug in :meth:`json.dumps` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) +- Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) Plotting ^^^^^^^^ From 259018d6b1bb033236258246ad18cf9fad7aa7df Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 01:24:03 +0000 Subject: [PATCH 58/79] release bytes in Object_endTypeContext --- pandas/_libs/src/ujson/python/objToJSON.c | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 96c0dbdc80b9f..3525d5c34dd01 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2114,6 +2114,7 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->columnLabels = NULL; PyObject_Free(GET_TC(tc)->cStr); + free(bytes); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; From a856a4173dc78d6010ce513c8ec8e2753b8b5de0 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 01:50:54 +0000 Subject: [PATCH 59/79] in JT_BIGNUM change if+if into if+else if --- pandas/_libs/src/ujson/python/objToJSON.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 3525d5c34dd01..f54712a4af5ea 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1638,8 +1638,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); tc->type = JT_BIGNUM; } - - if (err) { + else if (err) { PRINTMARK(); goto INVALID; } @@ -2114,7 +2113,6 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->columnLabels = NULL; PyObject_Free(GET_TC(tc)->cStr); - free(bytes); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; From 1bbfdc2058db7adc249650080df6d7a33bc210c5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 03:48:01 +0000 Subject: [PATCH 60/79] added reallocation of bigNum_bytes --- pandas/_libs/src/ujson/python/objToJSON.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f54712a4af5ea..cdb9d2030b24b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -106,6 +106,8 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; + char *bigNum_bytes; // JT_BIGNUM storage + char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; @@ -119,6 +121,7 @@ typedef struct __TypeContext { typedef struct __PyObjectEncoder { JSONObjectEncoder enc; + // pass through the NpyArrContext when encoding multi-dimensional arrays NpyArrContext *npyCtxtPassthru; @@ -2111,7 +2114,7 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - + if (tc->type == JT_BIGNUM) free(GET_TC(tc)->bigNum_bytes); PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); @@ -2139,6 +2142,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, char* bytes = malloc(*_outLen + 1); memcpy(bytes, str, *_outLen + 1); GET_TC(tc)->cStr = bytes; + GET_TC(tc)->bigNum_bytes = bytes; Py_DECREF(repr); From 665b1461d5706d65269460372637bb5399da2ac3 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 04:09:41 +0000 Subject: [PATCH 61/79] removed bigNum_bytes --- pandas/_libs/src/ujson/python/objToJSON.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index cdb9d2030b24b..5218f2f7ff5c7 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -106,8 +106,6 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char *bigNum_bytes; // JT_BIGNUM storage - char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; @@ -2114,7 +2112,6 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - if (tc->type == JT_BIGNUM) free(GET_TC(tc)->bigNum_bytes); PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); @@ -2142,7 +2139,6 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, char* bytes = malloc(*_outLen + 1); memcpy(bytes, str, *_outLen + 1); GET_TC(tc)->cStr = bytes; - GET_TC(tc)->bigNum_bytes = bytes; Py_DECREF(repr); From 360829742e5b8cde1127f92d20d544eb1cda6e5e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 05:11:59 +0000 Subject: [PATCH 62/79] added to_json test for ints>sys.maxsize --- pandas/tests/io/json/test_pandas.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 137e4c991d080..a714b72505ebd 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,6 +4,7 @@ from io import StringIO import json import os +import sys import numpy as np import pytest @@ -1242,6 +1243,25 @@ def test_read_jsonl_unicode_chars(self): expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + def test_to_json_large_numbers(self): + # GH34473 + bigNum = sys.maxsize + 1 + + originalSeries = Series(bigNum, dtype=object, index=["articleId"]) + originalSeries.to_json() + # json = StringIO(json) + # newSeries = read_json(json, typ="series") + # tm.assert_series_equal(originalSeries, newSeries) + + originalDataFrame = DataFrame( + bigNum, dtype=object, index=["articleId"], columns=[0] + ) + originalDataFrame.to_json() + # json = originalDataFrame.to_json() + # json = StringIO(json) + # newDataFrame = read_json(json) + # tm.assert_frame_equal(originalDataFrame, newDataFrame) + def test_read_json_large_numbers(self): # GH18842 json = '{"articleId": "1404366058080022500245"}' From 176f2126f156892c1aa6fbdca2509cfa2d4d5049 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 19 Jun 2020 01:21:55 -0400 Subject: [PATCH 63/79] Use python malloc to match PyObject_Free in endTypeContext Co-authored-by: William Ayd --- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 5218f2f7ff5c7..095cb0c43ef0e 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -2136,7 +2136,7 @@ const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { PyObject* repr = PyObject_Str(obj); const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen); - char* bytes = malloc(*_outLen + 1); + char* bytes = PyObject_Malloc(*_outLen + 1); memcpy(bytes, str, *_outLen + 1); GET_TC(tc)->cStr = bytes; From 9b587589994cee558045bab0252993dc274dc372 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 05:36:10 +0000 Subject: [PATCH 64/79] TST: added manually constructed strs to compare encodings --- pandas/tests/io/json/test_pandas.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a714b72505ebd..de0a997c152b5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1248,19 +1248,16 @@ def test_to_json_large_numbers(self): bigNum = sys.maxsize + 1 originalSeries = Series(bigNum, dtype=object, index=["articleId"]) - originalSeries.to_json() - # json = StringIO(json) - # newSeries = read_json(json, typ="series") - # tm.assert_series_equal(originalSeries, newSeries) + result = originalSeries.to_json() + expected = '{"articleId":' + str(bigNum) + "}" + assert result == expected originalDataFrame = DataFrame( bigNum, dtype=object, index=["articleId"], columns=[0] ) - originalDataFrame.to_json() - # json = originalDataFrame.to_json() - # json = StringIO(json) - # newDataFrame = read_json(json) - # tm.assert_frame_equal(originalDataFrame, newDataFrame) + result = originalDataFrame.to_json() + expected = '{"0":{"articleId":' + str(bigNum) + "}}" + assert result == expected def test_read_json_large_numbers(self): # GH18842 From 9cbf5963506c21addc60cd0de4fe4e7a6363dceb Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 18:05:33 +0000 Subject: [PATCH 65/79] fixed styling to minimize diff with master --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - pandas/io/json/_json.py | 63 +++++++---------------- 2 files changed, 18 insertions(+), 46 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 095cb0c43ef0e..17182a43e2b4e 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1643,7 +1643,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); goto INVALID; } - return; } else if (PyFloat_Check(obj)) { diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 043f7c608155c..ac6f9ff372601 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -27,7 +27,6 @@ loads = json.loads dumps = json.dumps - TABLE_SCHEMA_VERSION = "0.20.0" @@ -356,15 +355,14 @@ def read_json( dtype=None, convert_axes=None, convert_dates=True, - keep_default_dates: bool = True, - numpy: bool = False, - precise_float: bool = False, + keep_default_dates=True, + numpy=False, + precise_float=False, date_unit=None, encoding=None, - lines: bool = False, - chunksize: Optional[int] = None, + lines=False, + chunksize=None, compression="infer", - nrows: Optional[int] = None, ): """ Convert a JSON string to pandas object. @@ -495,7 +493,6 @@ def read_json( for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -503,13 +500,6 @@ def read_json( otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - nrows : int, optional - The number of lines from the line-delimited jsonfile that has to be read. - This can only be passed if `lines=True`. - If this is None, all the rows will be returned. - - .. versionadded:: 1.1 - Returns ------- Series or DataFrame @@ -610,7 +600,6 @@ def read_json( lines=lines, chunksize=chunksize, compression=compression, - nrows=nrows, ) if chunksize: @@ -640,17 +629,17 @@ def __init__( dtype, convert_axes, convert_dates, - keep_default_dates: bool, - numpy: bool, - precise_float: bool, + keep_default_dates, + numpy, + precise_float, date_unit, encoding, - lines: bool, - chunksize: Optional[int], + lines, + chunksize, compression, - nrows: Optional[int], ): + self.path_or_buf = filepath_or_buffer self.orient = orient self.typ = typ self.dtype = dtype @@ -666,16 +655,11 @@ def __init__( self.chunksize = chunksize self.nrows_seen = 0 self.should_close = False - self.nrows = nrows if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") - if self.nrows is not None: - self.nrows = _validate_integer("nrows", self.nrows, 0) - if not self.lines: - raise ValueError("nrows can only be passed if lines=True") data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) @@ -688,9 +672,9 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, "read") and (not self.chunksize or not self.nrows): + if hasattr(data, "read") and not self.chunksize: data = data.read() - if not hasattr(data, "read") and (self.chunksize or self.nrows): + if not hasattr(data, "read") and self.chunksize: data = StringIO(data) return data @@ -738,17 +722,11 @@ def read(self): """ Read the whole JSON input into a pandas object. """ - if self.lines: - if self.chunksize: - obj = concat(self) - elif self.nrows: - lines = list(islice(self.data, self.nrows)) - lines_json = self._combine_lines(lines) - obj = self._get_object_parser(lines_json) - else: - data = ensure_str(self.data) - data = data.split("\n") - obj = self._get_object_parser(self._combine_lines(data)) + if self.lines and self.chunksize: + obj = concat(self) + elif self.lines: + data = ensure_str(self.data) + obj = self._get_object_parser(self._combine_lines(data.split("\n"))) else: obj = self._get_object_parser(self.data) self.close() @@ -795,11 +773,6 @@ def close(self): pass def __next__(self): - if self.nrows: - if self.nrows_seen >= self.nrows: - self.close() - raise StopIteration - lines = list(islice(self.data, self.chunksize)) if lines: lines_json = self._combine_lines(lines) From 7ee21eb1c7d2d7d95c1e1400515255c569eff641 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 18:15:51 +0000 Subject: [PATCH 66/79] fixed styling --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 095cb0c43ef0e..ed6093ba70e15 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1644,7 +1644,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } - return; } else if (PyFloat_Check(obj)) { PRINTMARK(); From ff2e25e86aa534031b69ff2e67f7175365ea4f5d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 18:36:56 +0000 Subject: [PATCH 67/79] fixed conflicts with master --- pandas/io/json/_json.py | 63 +++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ac6f9ff372601..043f7c608155c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -27,6 +27,7 @@ loads = json.loads dumps = json.dumps + TABLE_SCHEMA_VERSION = "0.20.0" @@ -355,14 +356,15 @@ def read_json( dtype=None, convert_axes=None, convert_dates=True, - keep_default_dates=True, - numpy=False, - precise_float=False, + keep_default_dates: bool = True, + numpy: bool = False, + precise_float: bool = False, date_unit=None, encoding=None, - lines=False, - chunksize=None, + lines: bool = False, + chunksize: Optional[int] = None, compression="infer", + nrows: Optional[int] = None, ): """ Convert a JSON string to pandas object. @@ -493,6 +495,7 @@ def read_json( for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -500,6 +503,13 @@ def read_json( otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. + nrows : int, optional + The number of lines from the line-delimited jsonfile that has to be read. + This can only be passed if `lines=True`. + If this is None, all the rows will be returned. + + .. versionadded:: 1.1 + Returns ------- Series or DataFrame @@ -600,6 +610,7 @@ def read_json( lines=lines, chunksize=chunksize, compression=compression, + nrows=nrows, ) if chunksize: @@ -629,17 +640,17 @@ def __init__( dtype, convert_axes, convert_dates, - keep_default_dates, - numpy, - precise_float, + keep_default_dates: bool, + numpy: bool, + precise_float: bool, date_unit, encoding, - lines, - chunksize, + lines: bool, + chunksize: Optional[int], compression, + nrows: Optional[int], ): - self.path_or_buf = filepath_or_buffer self.orient = orient self.typ = typ self.dtype = dtype @@ -655,11 +666,16 @@ def __init__( self.chunksize = chunksize self.nrows_seen = 0 self.should_close = False + self.nrows = nrows if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") + if self.nrows is not None: + self.nrows = _validate_integer("nrows", self.nrows, 0) + if not self.lines: + raise ValueError("nrows can only be passed if lines=True") data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) @@ -672,9 +688,9 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, "read") and not self.chunksize: + if hasattr(data, "read") and (not self.chunksize or not self.nrows): data = data.read() - if not hasattr(data, "read") and self.chunksize: + if not hasattr(data, "read") and (self.chunksize or self.nrows): data = StringIO(data) return data @@ -722,11 +738,17 @@ def read(self): """ Read the whole JSON input into a pandas object. """ - if self.lines and self.chunksize: - obj = concat(self) - elif self.lines: - data = ensure_str(self.data) - obj = self._get_object_parser(self._combine_lines(data.split("\n"))) + if self.lines: + if self.chunksize: + obj = concat(self) + elif self.nrows: + lines = list(islice(self.data, self.nrows)) + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + else: + data = ensure_str(self.data) + data = data.split("\n") + obj = self._get_object_parser(self._combine_lines(data)) else: obj = self._get_object_parser(self.data) self.close() @@ -773,6 +795,11 @@ def close(self): pass def __next__(self): + if self.nrows: + if self.nrows_seen >= self.nrows: + self.close() + raise StopIteration + lines = list(islice(self.data, self.chunksize)) if lines: lines_json = self._combine_lines(lines) From ce37048eee1015e8e3c1f7bb988489b37f659678 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 18:39:32 +0000 Subject: [PATCH 68/79] fix styling to minimize diff --- pandas/_libs/src/ujson/python/objToJSON.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index ea204990bf916..ed6093ba70e15 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1643,11 +1643,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); goto INVALID; } -<<<<<<< HEAD -======= - ->>>>>>> 9cbf5963506c21addc60cd0de4fe4e7a6363dceb return; } else if (PyFloat_Check(obj)) { PRINTMARK(); From 2db12c066b360381af71faee267d46ed8a464db3 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 18:42:14 +0000 Subject: [PATCH 69/79] fix styling to minimize diff --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index ed6093ba70e15..1de9642761961 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -119,7 +119,6 @@ typedef struct __TypeContext { typedef struct __PyObjectEncoder { JSONObjectEncoder enc; - // pass through the NpyArrContext when encoding multi-dimensional arrays NpyArrContext *npyCtxtPassthru; From 3e820acd3105793e803b137c20638aa7327d0b28 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 19:19:23 +0000 Subject: [PATCH 70/79] fixed styling --- pandas/io/json/_json.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 043f7c608155c..b973553a767ba 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -27,7 +27,6 @@ loads = json.loads dumps = json.dumps - TABLE_SCHEMA_VERSION = "0.20.0" From 7afeadb144cd3b587fd0bae990346898e2dd4f80 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 19:48:30 +0000 Subject: [PATCH 71/79] added negative nigNum to test_to_json_large_numers --- pandas/tests/io/json/test_pandas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 96714f33cecf5..deadcdb2b995d 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1243,10 +1243,9 @@ def test_read_jsonl_unicode_chars(self): expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_to_json_large_numbers(self): + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 1)]) + def test_to_json_large_numbers(self, bigNum): # GH34473 - bigNum = sys.maxsize + 1 - originalSeries = Series(bigNum, dtype=object, index=["articleId"]) result = originalSeries.to_json() expected = '{"articleId":' + str(bigNum) + "}" From e4df0f8cb3c1fbf2fa2655cf7a6e899abeff3f20 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 20:06:21 +0000 Subject: [PATCH 72/79] added negative nigNum to test_to_json_large_numers --- pandas/tests/io/json/test_ujson.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 22fd595035f5d..7244776b21747 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -560,14 +560,15 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - def test_dumps_ints_larger_than_maxsize(self): + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 1)]) + def test_dumps_ints_larger_than_maxsize(self, bigNum): # GH34395 - big_num = sys.maxsize + 1 - encoding = ujson.encode(big_num) + bigNum = sys.maxsize + 1 + encoding = ujson.encode(bigNum) - assert str(big_num) == encoding + assert str(bigNum) == encoding # ujson.loads to be fixed in the future - # assert ujson.loads(encoding) == big_num + # assert ujson.loads(encoding) == bigNum @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] From 7b041fe48e51d179d5ad90db57887284b6e45de7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 19 Jun 2020 19:07:05 -0400 Subject: [PATCH 73/79] Update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd --- pandas/tests/io/json/test_ujson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 7244776b21747..3bcfc29872337 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -560,7 +560,7 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 1)]) + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) def test_dumps_ints_larger_than_maxsize(self, bigNum): # GH34395 bigNum = sys.maxsize + 1 From 21d9e982d7ebd8bb266db63aee29a52fe94809f9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 19 Jun 2020 23:09:34 +0000 Subject: [PATCH 74/79] fixe test_to_json_for_large_nums for -ve --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index deadcdb2b995d..02d3b5314966b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1243,7 +1243,7 @@ def test_read_jsonl_unicode_chars(self): expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 1)]) + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) def test_to_json_large_numbers(self, bigNum): # GH34473 originalSeries = Series(bigNum, dtype=object, index=["articleId"]) From 2d43001ca56340f02398ea6d64f4cda0b8adf522 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 22 Jun 2020 06:08:48 +0000 Subject: [PATCH 75/79] TST: added xfail for ujson.encode with long int input --- pandas/tests/io/json/test_ujson.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 3bcfc29872337..e1a136e1a3728 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -565,10 +565,11 @@ def test_dumps_ints_larger_than_maxsize(self, bigNum): # GH34395 bigNum = sys.maxsize + 1 encoding = ujson.encode(bigNum) - assert str(bigNum) == encoding - # ujson.loads to be fixed in the future - # assert ujson.loads(encoding) == bigNum + + # GH20599 + with pytest.raises(ValueError): + assert ujson.loads(encoding) == bigNum @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] From 6053227948e6b329cc7555ea71b98c1ef5089017 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 22 Jun 2020 06:13:11 +0000 Subject: [PATCH 76/79] TST: fixed variable names in test_to_json_large_numbers --- pandas/tests/io/json/test_pandas.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 02d3b5314966b..5882cbd13244c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1246,15 +1246,13 @@ def test_read_jsonl_unicode_chars(self): @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) def test_to_json_large_numbers(self, bigNum): # GH34473 - originalSeries = Series(bigNum, dtype=object, index=["articleId"]) - result = originalSeries.to_json() + series = Series(bigNum, dtype=object, index=["articleId"]) + result = series.to_json() expected = '{"articleId":' + str(bigNum) + "}" assert result == expected - originalDataFrame = DataFrame( - bigNum, dtype=object, index=["articleId"], columns=[0] - ) - result = originalDataFrame.to_json() + df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) + result = df.to_json() expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert result == expected From a6884688ebc7e43c5a736402104d78ce8f6f6c85 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 22 Jun 2020 06:17:57 +0000 Subject: [PATCH 77/79] TST: added xfail test for json.decode Series with long int --- pandas/tests/io/json/test_pandas.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5882cbd13244c..f92659a7aa09e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1247,9 +1247,14 @@ def test_read_jsonl_unicode_chars(self): def test_to_json_large_numbers(self, bigNum): # GH34473 series = Series(bigNum, dtype=object, index=["articleId"]) - result = series.to_json() + json = series.to_json() expected = '{"articleId":' + str(bigNum) + "}" - assert result == expected + assert json == expected + # GH 20599 + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) + tm.assert_series_equal(series, result) df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) result = df.to_json() From 9e1b95f721de87d83a2644f49a39528b4b81f536 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 22 Jun 2020 06:33:58 +0000 Subject: [PATCH 78/79] TST: added xfail test for json.decode DataFrame with long int --- pandas/tests/io/json/test_pandas.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f92659a7aa09e..39b0be6513004 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1257,9 +1257,14 @@ def test_to_json_large_numbers(self, bigNum): tm.assert_series_equal(series, result) df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) - result = df.to_json() + json = df.to_json() expected = '{"0":{"articleId":' + str(bigNum) + "}}" - assert result == expected + assert json == expected + # GH 20599 + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) + tm.assert_frame_equal(df, result) def test_read_json_large_numbers(self): # GH18842 From 4e53974b8a5fb2ea83d8ea30fcb066ffa452b4c6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 22 Jun 2020 20:44:10 +0000 Subject: [PATCH 79/79] BENCH: added benchmarks for long ints --- asv_bench/benchmarks/io/json.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index a490e250943f5..ed0fb5b8fe342 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,3 +1,5 @@ +import sys + import numpy as np from pandas import DataFrame, concat, date_range, read_json, timedelta_range @@ -82,6 +84,7 @@ def setup(self, orient, frame): timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) + longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) strings = tm.makeStringIndex(N) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) @@ -120,6 +123,18 @@ def setup(self, orient, frame): index=index, ) + self.df_longint_float_str = DataFrame( + { + "longint_1": longints, + "longint_2": longints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) + def time_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) @@ -172,6 +187,7 @@ def setup(self): timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) + longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) strings = tm.makeStringIndex(N) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) @@ -209,6 +225,17 @@ def setup(self): }, index=index, ) + self.df_longint_float_str = DataFrame( + { + "longint_1": longints, + "longint_2": longints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) def time_floats_with_int_idex_lines(self): self.df.to_json(self.fname, orient="records", lines=True) @@ -225,6 +252,9 @@ def time_float_int_lines(self): def time_float_int_str_lines(self): self.df_int_float_str.to_json(self.fname, orient="records", lines=True) + def time_float_longint_str_lines(self): + self.df_longint_float_str.to_json(self.fname, orient="records", lines=True) + class ToJSONMem: def setup_cache(self):