From 0f784de42849bfa04401f3f8d21d15f0ea264b61 Mon Sep 17 00:00:00 2001 From: Kaushal Rohit Date: Thu, 19 Dec 2019 00:49:01 +0530 Subject: [PATCH 1/4] BUG: Integer Overflow in read_json with big number in string --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/json/_json.py | 2 +- pandas/tests/io/json/test_pandas.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e44fec112c5c1..6e0fc2b3ecece 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -803,6 +803,7 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) +- Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. 
(:issue:`30320`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 30c1c2d59e983..d7abf921e69fc 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -946,7 +946,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): if (new_data == data).all(): data = new_data result = True - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): pass # coerce ints to 64 diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index bce3d1de849aa..efeb6fd008798 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1601,3 +1601,16 @@ def test_json_indent_all_orients(self, orient, expected): def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): pd.DataFrame().to_json(indent=-1) + + @pytest.mark.parametrize( + "json,expected", + [ + ( + json.dumps([{"col": "31900441201190696999"}, {"col": "3190044"}]), + DataFrame({"col": [31900441201190696999.0, 3190044.0]}), + ), + ], + ) + def test_frame_int_overflow(self, json, expected): + result = read_json(json) + tm.assert_frame_equal(result, expected) From 68cb4b028647705567a96274faab9eced770d82d Mon Sep 17 00:00:00 2001 From: Kaushal Rohit Date: Thu, 19 Dec 2019 01:11:06 +0530 Subject: [PATCH 2/4] CLN: Removed parametrize from test because single test case --- pandas/tests/io/json/test_pandas.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index efeb6fd008798..5d2ca4d89a40a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1602,15 +1602,8 @@ def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): pd.DataFrame().to_json(indent=-1) - @pytest.mark.parametrize( - "json,expected", - [ - ( - 
json.dumps([{"col": "31900441201190696999"}, {"col": "3190044"}]), - DataFrame({"col": [31900441201190696999.0, 3190044.0]}), - ), - ], - ) - def test_frame_int_overflow(self, json, expected): - result = read_json(json) + def test_frame_int_overflow(self): + encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "3190044"}]) + expected = DataFrame({"col": [31900441201190696999.0, 3190044.0]}) + result = read_json(encoded_json) tm.assert_frame_equal(result, expected) From 7ac3f1b23b4421b2fc8e1c7100d8cad2c0d80398 Mon Sep 17 00:00:00 2001 From: Kaushal Rohit Date: Thu, 19 Dec 2019 08:42:58 +0530 Subject: [PATCH 3/4] CLN: rectified test and added issue number --- pandas/tests/io/json/test_pandas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5d2ca4d89a40a..9a138d2dd3bea 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1603,7 +1603,8 @@ def test_json_negative_indent_raises(self): pd.DataFrame().to_json(indent=-1) def test_frame_int_overflow(self): - encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "3190044"}]) - expected = DataFrame({"col": [31900441201190696999.0, 3190044.0]}) + # GH 30320 + encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}]) + expected = DataFrame({"col": ["31900441201190696999", "Text"]}) result = read_json(encoded_json) tm.assert_frame_equal(result, expected) From ad590d3a5c220d5604f0b5cacf7b7e8a7175bf4a Mon Sep 17 00:00:00 2001 From: Kaushal Rohit Date: Tue, 21 Jan 2020 08:46:25 +0530 Subject: [PATCH 4/4] CLN: Added bug fix to release notes removed whitespace from test_pandas --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/tests/io/json/test_pandas.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1cd325dad9f07..802793b3eb37f 100644 --- 
a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -120,7 +120,7 @@ MultiIndex I/O ^^^ - +- Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`) - - diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e7c4a2568e0bf..638bcaa21bdf9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1640,7 +1640,7 @@ def test_deprecate_numpy_argument_read_json(self): with tm.assert_produces_warning(FutureWarning): result = read_json(expected.to_json(), numpy=True) tm.assert_frame_equal(result, expected) - + def test_frame_int_overflow(self): # GH 30320 encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}])