From 5e70d1f46e848a04d068253d665e197ffc724058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Nie=C5=82acny?= Date: Thu, 16 Apr 2020 08:53:34 +0200 Subject: [PATCH] BUG: Raise a TypeError when record_path doesn't point to an array When `record_path` points to something that is Iterable but is not a sequence in the JSON world, we get odd results. ``` >>> json_normalize([{'key': 'value'}], record_path='key') 0 0 v 1 a 2 l 3 u 4 e ``` Based on RFC 8259 (https://tools.ietf.org/html/rfc8259), a JSON value MUST be an object, array, number, or string, or one of the literal names false, null, or true. But only two of them should be treated as Iterable. ``` An object is an unordered *collection* of zero or more name/value pairs, where a name is a string and a value is a string, number, boolean, null, object, or array. An array is an ordered *sequence* of zero or more values. -- https://tools.ietf.org/html/rfc8259#page-3 ``` Based on that, `[{'key': 'value'}]` and `{'key': 'value'}` should not be treated in the same way. In the `json_normalize` documentation, `record_path` is described as `Path in each object to list of records`. So when we translate JSON into Python objects, we need to take a list (sequence) into consideration. Based on that, `record_path` should point to a `list`, not an `Iterable`. In the specs I added all possible values that are allowed in JSON and should not be treated as a collection. There is a special case for the null value that is already implemented. 
| type | value | Iterable | Should be treated as list | |--------|---------|----------|---------------------------| | object | {} | Yes | No (unordered list) | | array | [] | Yes | Yes | | number | 1 | No | No | | string | "value" | Yes | No | | false | False | No | No | | null | Null | No | No (Check #30148) | | true | True | No | No | --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/json/_normalize.py | 12 ++++++------ pandas/tests/io/json/test_normalize.py | 12 +++++++----- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 82c43811c0444..5762c60a0d10a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -529,6 +529,7 @@ I/O - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) - Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`) - Bug in :meth:`read_json` was raising ``TypeError`` when reading a list of booleans into a Series. (:issue:`31464`) +- Bug in :func:`pandas.io.json.json_normalize` where location specified by `record_path` doesn't point to an array. (:issue:`26284`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 69e9b111a6c20..e833fdc20d542 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -239,23 +239,23 @@ def _pull_field( result = result[spec] return result - def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List: """ Interal function to pull field for records, and similar to - _pull_field, but require to return Iterable. And will raise error + _pull_field, but require to return list. And will raise error if has non iterable value. 
""" result = _pull_field(js, spec) - # GH 31507 GH 30145, if result is not Iterable, raise TypeError if not + # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not # null, otherwise return an empty list - if not isinstance(result, Iterable): + if not isinstance(result, list): if pd.isnull(result): result = [] else: raise TypeError( - f"{js} has non iterable value {result} for path {spec}. " - "Must be iterable or null." + f"{js} has non list value {result} for path {spec}. " + "Must be list or null." ) return result diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index b7a9918ff46da..4a32f3809c82b 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -475,13 +475,15 @@ def test_nonetype_record_path(self, nulls_fixture): expected = DataFrame({"i": 2}, index=[0]) tm.assert_equal(result, expected) - def test_non_interable_record_path_errors(self): - # see gh-30148 - test_input = {"state": "Texas", "info": 1} + @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"']) + def test_non_list_record_path_errors(self, value): + # see gh-30148, GH 26284 + parsed_value = json.loads(value) + test_input = {"state": "Texas", "info": parsed_value} test_path = "info" msg = ( - f"{test_input} has non iterable value 1 for path {test_path}. " - "Must be iterable or null." + f"{test_input} has non list value {parsed_value} for path {test_path}. " + "Must be list or null." ) with pytest.raises(TypeError, match=msg): json_normalize([test_input], record_path=[test_path])