From 4e8e2ff3585e57301c1cc33440fb9f676cb41a54 Mon Sep 17 00:00:00 2001 From: Tobias Gustafsson Date: Thu, 9 Feb 2017 20:28:35 +0100 Subject: [PATCH 1/2] BUG: Fix #15344 by backporting ujson usage of PEP 393 APIs for compact ascii --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/io/tests/json/test_pandas.py | 10 ++++++++++ pandas/src/ujson/python/objToJSON.c | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9afcf85c929a7..68ff10d239a79 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -517,3 +517,5 @@ Bug Fixes - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) + +- Bug in ``to_json`` causing single-byte ASCII characters to be expanded to four-byte Unicode (:issue:`15344`) \ No newline at end of file diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index ee5039c38b182..440f5c13d5121 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1044,3 +1044,13 @@ def roundtrip(s, encoding='latin-1'): for s in examples: roundtrip(s) + + def test_data_frame_size_after_to_json(self): + # GH15344 + df = DataFrame({'a': [str(1)]}) + + size_before = df.memory_usage(index=True, deep=True).sum() + df.to_json() + size_after = df.memory_usage(index=True, deep=True).sum() + + self.assertEqual(size_before, size_after) diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 42c0b62a57511..3a3f239e29cd2 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -402,6 +402,16 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { 
PyObject *obj = (PyObject *)_obj; + +#if (PY_VERSION_HEX >= 0x03030000) + if(PyUnicode_IS_COMPACT_ASCII(obj)) { + Py_ssize_t len; + char *data = PyUnicode_AsUTF8AndSize(obj, &len); + *_outLen = len; + return data; + } +#endif + PyObject *newObj = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), NULL); From 44de133bb512c37352d5ff39ccaec73318dcdf25 Mon Sep 17 00:00:00 2001 From: Tobias Gustafsson Date: Fri, 10 Feb 2017 13:28:25 +0100 Subject: [PATCH 2/2] Fix C-code formatting to pass linting of GH15344 --- pandas/src/ujson/python/objToJSON.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 3a3f239e29cd2..e3c75d3b6e081 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -404,7 +404,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, PyObject *obj = (PyObject *)_obj; #if (PY_VERSION_HEX >= 0x03030000) - if(PyUnicode_IS_COMPACT_ASCII(obj)) { + if (PyUnicode_IS_COMPACT_ASCII(obj)) { Py_ssize_t len; char *data = PyUnicode_AsUTF8AndSize(obj, &len); *_outLen = len;