From 77020c3742a3d29c4ce766b9184de71201339070 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 16 Dec 2019 19:59:28 -0800 Subject: [PATCH 1/7] Added test --- pandas/tests/io/json/test_pandas.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index bce3d1de849aa..1e8adc3a4d096 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1601,3 +1601,10 @@ def test_json_indent_all_orients(self, orient, expected): def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): pd.DataFrame().to_json(indent=-1) + + def test_emca_262_nan_inf_support(self): + # GH 12213 + data = "[NaN, Infinity, -Infinity]" + result = pd.read_json(data) + expected = pd.DataFrame([np.nan, np.inf, -np.inf]) + tm.assert_frame_equal(result, expected) From 4462ebb64f5a21a138fd5a80e3ffbdfcdc59d4d5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 16 Dec 2019 20:44:12 -0800 Subject: [PATCH 2/7] Added support for NaN, Inf --- pandas/_libs/src/ujson/lib/ultrajson.h | 4 ++ pandas/_libs/src/ujson/lib/ultrajsondec.c | 53 ++++++++++++++++++++++- pandas/_libs/src/ujson/python/JSONtoObj.c | 13 ++++-- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 05c3ae4096ad5..8d04874b4c9bf 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -154,6 +154,8 @@ enum JSTYPES { JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure JT_INVALID, // Internal, do not return nor expect + JT_POS_INF, // Positive infinity + JT_NEG_INF, // Negative infinity }; typedef void * JSOBJ; @@ -290,6 +292,8 @@ typedef struct __JSONObjectDecoder { JSOBJ (*newTrue)(void *prv); JSOBJ (*newFalse)(void *prv); JSOBJ (*newNull)(void *prv); + JSOBJ (*newPosInf)(void *prv); + JSOBJ (*newNegInf)(void *prv); JSOBJ (*newObject)(void *prv, void *decoder); JSOBJ (*endObject)(void *prv, JSOBJ obj); JSOBJ (*newArray)(void *prv, void *decoder); diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index a847b0f5d5102..30c494894b046 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -127,9 +127,16 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == '-') { + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) == 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { offset++; intNeg = -1; + if ((*offset) == 'I') { + goto DECODE_INF; + } overflowLimit = LLONG_MIN; } @@ -281,6 +288,48 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } } +DECODE_NAN: + offset++; + if (*(offset++) != 'a') goto SET_NAN_ERROR; + if (*(offset++) != 'N') goto SET_NAN_ERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); + +SET_NAN_ERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); + +DECODE_INF: + offset++; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'f') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 't') goto SET_INF_ERROR; + if (*(offset++) != 'y') goto SET_INF_ERROR; + + ds->start = offset; + + if (intNeg == 1) { + ds->lastType = JT_POS_INF; + return ds->dec->newPosInf(ds->prv); + } else { + ds->lastType = JT_NEG_INF; + return ds->dec->newNegInf(ds->prv); + } + +SET_INF_ERROR: + if (intNeg == 1) { + const char *msg = "Unexpected character found when decoding 'Infinity'"; + return SetError(ds, -1, msg); + } else { + const char *msg = "Unexpected character found when decoding '-Infinity'"; + return SetError(ds, -1, msg); + } + + BREAK_EXP_LOOP: // FIXME: Check for arithemtic overflow here ds->lastType = JT_DOUBLE; @@ -1070,6 +1119,8 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { case '7': case '8': case '9': + case 'I': + case 'N': case '-': return decode_numeric(ds); diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 7a2e5a584443a..f30285c1228f4 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -459,6 +459,10 @@ JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } +JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); } + +JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); } + JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } @@ -502,10 +506,11 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectDecoder dec = { Object_newString, Object_objectAddKey, Object_arrayAddItem, Object_newTrue, Object_newFalse, Object_newNull, - Object_newObject, Object_endObject, Object_newArray, - Object_endArray, Object_newInteger, Object_newLong, - Object_newDouble, Object_releaseObject, PyObject_Malloc, - PyObject_Free, PyObject_Realloc}; + Object_newPosInf, Object_newNegInf, Object_newObject, + Object_endObject, Object_newArray, Object_endArray, + Object_newInteger, Object_newLong, Object_newDouble, + Object_releaseObject, PyObject_Malloc, PyObject_Free, + PyObject_Realloc}; dec.preciseFloat = 0; dec.prv = NULL; From edb29ba97c47b5610f07fd8dead700494d972fdd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 16 Dec 2019 20:46:11 -0800 Subject: [PATCH 3/7] Whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c288a008777cf..6d3ea5e0d9eab 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -205,6 +205,8 @@ Other enhancements (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) +- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) + Build Changes ^^^^^^^^^^^^^ From 8cd6488676ef470afb3aaa358a6f37ef36b1e9b3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 16 Dec 2019 20:52:42 -0800 Subject: [PATCH 4/7] index fix --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 30c494894b046..b54285639c971 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -323,10 +323,10 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { SET_INF_ERROR: if (intNeg == 1) { const char *msg = "Unexpected character found when decoding 'Infinity'"; - return SetError(ds, -1, msg); + return SetError(ds, -1, msg); } else { const char *msg = "Unexpected character found when decoding '-Infinity'"; - return SetError(ds, -1, msg); + return SetError(ds, -1, msg); } From 80d675e44cbf975e115c3275ce7d6b848a8f345f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 18 Dec 2019 10:40:25 -0800 Subject: [PATCH 5/7] Stylistic updates --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index b54285639c971..20f14271c5f69 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -134,7 +134,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } else if (*(offset) == '-') { offset++; intNeg = -1; - if ((*offset) == 'I') { + if (*(offset) == 'I') { goto DECODE_INF; } overflowLimit = LLONG_MIN; From bd34513faa45e81cce6d3f762ad23a4e3d6fe2f1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Dec 2019 21:51:57 -0500 Subject: [PATCH 6/7] Tabs with spaces --- pandas/_libs/src/ujson/python/JSONtoObj.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index f30285c1228f4..b2fc788478864 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -506,11 +506,11 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectDecoder dec = { Object_newString, Object_objectAddKey, Object_arrayAddItem, Object_newTrue, Object_newFalse, Object_newNull, - Object_newPosInf, Object_newNegInf, Object_newObject, - Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newDouble, - Object_releaseObject, PyObject_Malloc, PyObject_Free, - PyObject_Realloc}; + Object_newPosInf, Object_newNegInf, Object_newObject, + Object_endObject, Object_newArray, Object_endArray, + Object_newInteger, Object_newLong, Object_newDouble, + Object_releaseObject, PyObject_Malloc, PyObject_Free, + PyObject_Realloc}; dec.preciseFloat = 0; dec.prv = NULL; From 2cbd5ddfb2a47c5def69ec09c2cedb334d1112ed Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 27 Dec 2019 13:56:57 -0500 Subject: [PATCH 7/7] add strings --- pandas/tests/io/json/test_pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e229e8ec3c976..3068261b9b3a5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1600,7 +1600,9 @@ def test_json_negative_indent_raises(self): def test_emca_262_nan_inf_support(self): # GH 12213 - data = "[NaN, Infinity, -Infinity]" + data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' result = pd.read_json(data) - expected = pd.DataFrame([np.nan, np.inf, -np.inf]) + expected = pd.DataFrame( + ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] + ) tm.assert_frame_equal(result, expected)