diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 7caaec62c0a8a..ca058c919cad5 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -2186,6 +2186,19 @@ into a flat table.
 
     json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
 
+The ``max_level`` parameter provides more control over which level to end normalization at.
+With ``max_level=1`` the following snippet normalizes until the first nesting level of the provided dict.
+
+.. ipython:: python
+
+   data = [{'CreatedBy': {'Name': 'User001'},
+            'Lookup': {'TextField': 'Some text',
+                       'UserField': {'Id': 'ID001',
+                                     'Name': 'Name001'}},
+            'Image': {'a': 'b'}
+            }]
+   json_normalize(data, max_level=1)
+
 .. _io.jsonl:
 
 Line delimited json
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 901e4f6942897..3ef85df41aa33 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -106,6 +106,29 @@ the output will truncate, if it's wider than :attr:`options.display.width`
 (default: 80 characters).
 
 
+.. _whatsnew_0250.enhancements.json_normalize_with_max_level:
+
+json_normalize with max_level parameter support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`json_normalize` normalizes the provided input dict to all
+nested levels. The new ``max_level`` parameter provides more control
+over which level to end normalization at (:issue:`23843`).
+
+For example:
+
+.. ipython:: python
+
+    from pandas.io.json import json_normalize
+    data = [{
+        'CreatedBy': {'Name': 'User001'},
+        'Lookup': {'TextField': 'Some text',
+                   'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
+        'Image': {'a': 'b'}
+    }]
+    json_normalize(data, max_level=1)
+
+
 .. _whatsnew_0250.enhancements.other:
 
 Other Enhancements
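Editor's note (illustration only, not part of the patch): the two documentation
examples above rely on the behaviour sketched here, assuming a pandas build with
this change applied::

    from pandas.io.json import json_normalize

    data = [{'CreatedBy': {'Name': 'User001'},
             'Lookup': {'TextField': 'Some text',
                        'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
             'Image': {'a': 'b'}}]

    # max_level=1 expands only the first nesting level: 'CreatedBy.Name',
    # 'Lookup.TextField' and 'Image.a' become scalar columns, while
    # 'Lookup.UserField' keeps its dict value (see the new tests further down).
    df = json_normalize(data, max_level=1)
    print(df.columns.tolist())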
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index 2d8bc20b1195e..5c6018d399c82 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -3,6 +3,7 @@
 
 from collections import defaultdict
 import copy
+from typing import DefaultDict, Dict, List, Optional, Union
 
 import numpy as np
 
@@ -25,9 +26,11 @@ def _convert_to_line_delimits(s):
     return convert_json_to_lines(s)
 
 
-def nested_to_record(ds, prefix="", sep=".", level=0):
+def nested_to_record(ds, prefix: str = "",
+                     sep: str = ".", level: int = 0,
+                     max_level: Optional[int] = None):
     """
-    A simplified json_normalize.
+    A simplified json_normalize
 
     Converts a nested dict into a flat dict ("record"), unlike json_normalize,
     it does not attempt to extract a subset of the data.
@@ -36,13 +39,19 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     ----------
     ds : dict or list of dicts
     prefix: the prefix, optional, default: ""
-    sep : string, default '.'
+    sep : str, default '.'
         Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
 
        .. versionadded:: 0.20.0
 
-    level: the number of levels in the jason string, optional, default: 0
+    level: int, optional, default: 0
+        The number of levels in the json string.
+
+    max_level: int, optional, default: None
+        The max depth to normalize.
+
+        .. versionadded:: 0.25.0
 
     Returns
     -------
@@ -65,10 +74,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     if isinstance(ds, dict):
         ds = [ds]
         singleton = True
-
     new_ds = []
     for d in ds:
-
         new_d = copy.deepcopy(d)
         for k, v in d.items():
             # each key gets renamed with prefix
@@ -79,16 +86,20 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
             else:
                 newkey = prefix + sep + k
 
+            # flatten if type is dict and
+            # current dict level < maximum level provided and
             # only dicts gets recurse-flattened
             # only at level>1 do we rename the rest of the keys
-            if not isinstance(v, dict):
+            if (not isinstance(v, dict) or
+                    (max_level is not None and level >= max_level)):
                 if level != 0:  # so we skip copying for top level, common case
                     v = new_d.pop(k)
                     new_d[newkey] = v
                 continue
             else:
                 v = new_d.pop(k)
-                new_d.update(nested_to_record(v, newkey, sep, level + 1))
+                new_d.update(nested_to_record(v, newkey, sep, level + 1,
+                                              max_level))
         new_ds.append(new_d)
 
     if singleton:
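Editor's note (illustration only, not part of the patch): the hunk above adds the
``max_level`` cutoff to ``nested_to_record``. The rule can be sketched as a small
standalone helper (the name ``flatten`` is hypothetical): recursion into a nested
dict stops once the current ``level`` reaches ``max_level``::

    def flatten(d, prefix="", sep=".", level=0, max_level=None):
        out = {}
        for k, v in d.items():
            key = k if not prefix else prefix + sep + k
            # mirror the patched condition: keep recursing only while the
            # value is a dict and the depth limit has not been reached
            if isinstance(v, dict) and (max_level is None or level < max_level):
                out.update(flatten(v, key, sep, level + 1, max_level))
            else:
                out[key] = v
        return out

    record = {'Lookup': {'TextField': 'Some text',
                         'UserField': {'Id': 'ID001'}}}
    print(flatten(record, max_level=1))
    # {'Lookup.TextField': 'Some text', 'Lookup.UserField': {'Id': 'ID001'}}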
@@ -96,45 +107,58 @@
         return new_ds
 
 
-def json_normalize(data, record_path=None, meta=None,
-                   meta_prefix=None,
-                   record_prefix=None,
-                   errors='raise',
-                   sep='.'):
+def json_normalize(data: List[Dict],
+                   record_path: Optional[Union[str, List]] = None,
+                   meta: Optional[Union[str, List]] = None,
+                   meta_prefix: Optional[str] = None,
+                   record_prefix: Optional[str] = None,
+                   errors: Optional[str] = 'raise',
+                   sep: str = '.',
+                   max_level: Optional[int] = None):
     """
     Normalize semi-structured JSON data into a flat table.
 
     Parameters
     ----------
     data : dict or list of dicts
-        Unserialized JSON objects
-    record_path : string or list of strings, default None
+        Unserialized JSON objects.
+    record_path : str or list of str, default None
         Path in each object to list of records. If not passed, data will be
-        assumed to be an array of records
-    meta : list of paths (string or list of strings), default None
-        Fields to use as metadata for each record in resulting table
-    meta_prefix : string, default None
-    record_prefix : string, default None
+        assumed to be an array of records.
+    meta : list of paths (str or list of str), default None
+        Fields to use as metadata for each record in resulting table.
+    meta_prefix : str, default None
         If True, prefix records with dotted (?) path, e.g. foo.bar.field if
-        path to records is ['foo', 'bar']
+        meta is ['foo', 'bar'].
+    record_prefix : str, default None
+        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar'].
     errors : {'raise', 'ignore'}, default 'raise'
+        Configures error handling.
 
         * 'ignore' : will ignore KeyError if keys listed in meta are not
-          always present
+          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
-          always present
+          always present.
 
        .. versionadded:: 0.20.0
 
-    sep : string, default '.'
-        Nested records will generate names separated by sep,
-        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+    sep : str, default '.'
+        Nested records will generate names separated by sep,
+        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
 
        .. versionadded:: 0.20.0
 
+    max_level : int, default None
+        Max number of levels (depth of dict) to normalize.
+        If None, normalizes all levels.
+
+        .. versionadded:: 0.25.0
+
     Returns
     -------
     frame : DataFrame
+        The normalized data, represented as a flat table.
 
     Examples
     --------
@@ -149,36 +173,62 @@ def json_normalize(data, record_path=None, meta=None,
     1  NaN         NaN      Regner        NaN       Mose       NaN
     2  2.0  Faye Raker         NaN        NaN        NaN       NaN
 
+    >>> data = [{'id': 1,
+    ...          'name': "Cole Volk",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'name': "Mose Reg",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'id': 2, 'name': 'Faye Raker',
+    ...          'fitness': {'height': 130, 'weight': 60}}]
+    >>> json_normalize(data, max_level=0)
+                              fitness   id        name
+    0  {'height': 130, 'weight': 60}  1.0   Cole Volk
+    1  {'height': 130, 'weight': 60}  NaN    Mose Reg
+    2  {'height': 130, 'weight': 60}  2.0  Faye Raker
+
+    Normalizes nested data up to level 1.
+
+    >>> data = [{'id': 1,
+    ...          'name': "Cole Volk",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'name': "Mose Reg",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'id': 2, 'name': 'Faye Raker',
+    ...          'fitness': {'height': 130, 'weight': 60}}]
+    >>> json_normalize(data, max_level=1)
+       fitness.height  fitness.weight   id        name
+    0             130              60  1.0   Cole Volk
+    1             130              60  NaN    Mose Reg
+    2             130              60  2.0  Faye Raker
+
     >>> data = [{'state': 'Florida',
     ...          'shortname': 'FL',
-    ...          'info': {
-    ...               'governor': 'Rick Scott'
-    ...          },
+    ...          'info': {'governor': 'Rick Scott'},
     ...          'counties': [{'name': 'Dade', 'population': 12345},
-    ...                      {'name': 'Broward', 'population': 40000},
-    ...                      {'name': 'Palm Beach', 'population': 60000}]},
+    ...                       {'name': 'Broward', 'population': 40000},
+    ...                       {'name': 'Palm Beach', 'population': 60000}]},
     ...         {'state': 'Ohio',
     ...          'shortname': 'OH',
-    ...          'info': {
-    ...               'governor': 'John Kasich'
-    ...          },
+    ...          'info': {'governor': 'John Kasich'},
     ...          'counties': [{'name': 'Summit', 'population': 1234},
     ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
     >>> result = json_normalize(data, 'counties', ['state', 'shortname',
-    ...                                        ['info', 'governor']])
+    ...                                            ['info', 'governor']])
     >>> result
-        name population info.governor    state shortname
-    0        Dade      12345    Rick Scott  Florida        FL
-    1     Broward      40000    Rick Scott  Florida        FL
-    2  Palm Beach      60000    Rick Scott  Florida        FL
-    3      Summit       1234   John Kasich     Ohio        OH
-    4    Cuyahoga       1337   John Kasich     Ohio        OH
+         name  population    state shortname info.governor
+    0        Dade       12345  Florida        FL    Rick Scott
+    1     Broward       40000  Florida        FL    Rick Scott
+    2  Palm Beach       60000  Florida        FL    Rick Scott
+    3      Summit        1234     Ohio        OH   John Kasich
+    4    Cuyahoga        1337     Ohio        OH   John Kasich
 
     >>> data = {'A': [1, 2]}
     >>> json_normalize(data, 'A', record_prefix='Prefix.')
        Prefix.0
     0         1
     1         2
+
+    Returns normalized data with columns prefixed with the given string.
""" def _pull_field(js, spec): result = js @@ -206,7 +256,8 @@ def _pull_field(js, spec): # # TODO: handle record value which are lists, at least error # reasonably - data = nested_to_record(data, sep=sep) + data = nested_to_record(data, sep=sep, + max_level=max_level) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] @@ -219,10 +270,10 @@ def _pull_field(js, spec): meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now - records = [] + records = [] # type: List lengths = [] - meta_vals = defaultdict(list) + meta_vals = defaultdict(list) # type: DefaultDict if not isinstance(sep, str): sep = str(sep) meta_keys = [sep.join(val) for val in meta] @@ -241,10 +292,12 @@ def _recursive_extract(data, path, seen_meta, level=0): else: for obj in data: recs = _pull_field(obj, path[0]) + recs = [nested_to_record(r, sep=sep, + max_level=max_level) + if isinstance(r, dict) else r for r in recs] # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] @@ -260,7 +313,6 @@ def _recursive_extract(data, path, seen_meta, level=0): "{err} is not always present" .format(err=e)) meta_vals[key].append(meta_val) - records.extend(recs) _recursive_extract(data, record_path, {}, level=0) @@ -279,8 +331,5 @@ def _recursive_extract(data, path, seen_meta, level=0): if k in result: raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) - - # forcing dtype to object to avoid the metadata being casted to string result[k] = np.array(v, dtype=object).repeat(lengths) - return result diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a7407d843c6c9..3210f7bc83bdd 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -85,6 +85,19 @@ def missing_metadata(): ] +@pytest.fixture +def max_level_test_input_data(): + """ + input data to test json_normalize with max_level param + """ + return [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + + class TestJSONNormalize: def test_simple_records(self): @@ -168,8 +181,6 @@ def test_more_deeply_nested(self, deep_nested): result = json_normalize(deep_nested, ['states', 'cities'], meta=['country', ['states', 'name']]) - # meta_prefix={'states': 'state_'}) - ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3, 'states.name': ['California', 'California', 'Ohio', 'Ohio', 'Bayern', 'Nordrhein-Westfalen', @@ -294,6 +305,50 @@ def test_missing_field(self, author_missing_data): expected = DataFrame(ex_data) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("max_level,expected", [ + (0, [{"TextField": "Some text", + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}, + {"TextField": "Some text", + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}]), + (1, [{"TextField": "Some text", + "UserField.Id": "ID001", + "UserField.Name": "Name001", + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}, + {"TextField": "Some text", + "UserField.Id": "ID001", + "UserField.Name": "Name001", + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}])]) + def test_max_level_with_records_path(self, max_level, expected): + # GH23843: Enhanced JSON normalize + 
+        test_input = [{'CreatedBy': {'Name': 'User001'},
+                       'Lookup': [{'TextField': 'Some text',
+                                   'UserField': {'Id': 'ID001',
+                                                 'Name': 'Name001'}},
+                                  {'TextField': 'Some text',
+                                   'UserField': {'Id': 'ID001',
+                                                 'Name': 'Name001'}}
+                                  ],
+                       'Image': {'a': 'b'},
+                       'tags': [{'foo': 'something', 'bar': 'else'},
+                                {'foo': 'something2', 'bar': 'else2'}]
+                       }]
+
+        result = json_normalize(test_input,
+                                record_path=["Lookup"],
+                                meta=[["CreatedBy"], ["Image"]],
+                                max_level=max_level)
+        expected_df = DataFrame(data=expected,
+                                columns=result.columns.values)
+        tm.assert_equal(expected_df, result)
+
 
 class TestNestedToRecord:
 
@@ -301,7 +356,6 @@ def test_flat_stays_flat(self):
         recs = [dict(flat1=1, flat2=2),
                 dict(flat1=3, flat2=4),
                 ]
-
         result = nested_to_record(recs)
         expected = recs
         assert result == expected
@@ -356,20 +410,6 @@ def test_missing_meta(self, missing_metadata):
                                 record_path='addresses', meta='name',
                                 errors='ignore')
-        ex_data = [
-            {'city': 'Massillon',
-             'number': 9562,
-             'state': 'OH',
-             'street': 'Morris St.',
-             'zip': 44646,
-             'name': 'Alice'},
-            {'city': 'Elizabethton',
-             'number': 8449,
-             'state': 'TN',
-             'street': 'Spring St.',
-             'zip': 37643,
-             'name': np.nan}
-        ]
         ex_data = [
             ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
             ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
@@ -460,3 +500,68 @@ def test_nonetype_multiple_levels(self):
             'location.country.state.town.info.y': -33.148521423339844,
             'location.country.state.town.info.z': 27.572303771972656}
         assert result == expected
+
+    @pytest.mark.parametrize("max_level, expected", [
+        (None,
+         [{'CreatedBy.Name': 'User001',
+           'Lookup.TextField': 'Some text',
+           'Lookup.UserField.Id': 'ID001',
+           'Lookup.UserField.Name': 'Name001',
+           'Image.a': 'b'
+           }]),
+        (0,
+         [{'CreatedBy': {'Name': 'User001'},
+           'Lookup': {'TextField': 'Some text',
+                      'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
+           'Image': {'a': 'b'}
+           }]),
+        (1,
+         [{'CreatedBy.Name': 'User001',
+           'Lookup.TextField': 'Some text',
+           'Lookup.UserField': {'Id': 'ID001',
+                                'Name': 'Name001'},
+           'Image.a': 'b'
+           }])
+    ])
+    def test_with_max_level(self, max_level,
+                            expected, max_level_test_input_data):
+        # GH23843: Enhanced JSON normalize
+        output = nested_to_record(max_level_test_input_data,
+                                  max_level=max_level)
+        assert output == expected
+
+    def test_with_large_max_level(self):
+        # GH23843: Enhanced JSON normalize
+        max_level = 100
+        input_data = [{'CreatedBy': {
+            "user": {
+                "name": {"firstname": "Leo",
+                         "LastName": "Thomson"},
+                "family_tree": {
+                    "father": {
+                        "name": "Father001",
+                        "father": {
+                            "Name": "Father002",
+                            "father": {
+                                "name": "Father003",
+                                "father": {
+                                    "Name": "Father004",
+                                },
+                            },
+                        }
+                    }
+                }
+            }
+        }}]
+        expected = [
+            {'CreatedBy.user.name.firstname': 'Leo',
+             'CreatedBy.user.name.LastName': 'Thomson',
+             'CreatedBy.user.family_tree.father.name': 'Father001',
+             'CreatedBy.user.family_tree.father.father.Name': 'Father002',
+             'CreatedBy.user.family_tree.father.father.father.name':
+                 'Father003',
+             'CreatedBy.user.family_tree.father.father.father.father.Name':
+                 'Father004'}
+        ]
+        output = nested_to_record(input_data, max_level=max_level)
+        assert output == expected
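Editor's note (illustration only, not part of the patch): as the new docstring
examples and ``test_max_level_with_records_path`` above show, ``max_level`` also
applies when records are extracted via ``record_path``. A hedged usage sketch,
assuming a pandas build with this change applied::

    from pandas.io.json import json_normalize

    data = [{'CreatedBy': {'Name': 'User001'},
             'Lookup': [{'TextField': 'Some text',
                         'UserField': {'Id': 'ID001', 'Name': 'Name001'}}]}]

    # Record columns are flattened to the requested depth
    # ('TextField', 'UserField.Id', 'UserField.Name'), while the meta
    # column 'CreatedBy' keeps its dict value.
    df = json_normalize(data, record_path=['Lookup'],
                        meta=[['CreatedBy']], max_level=1)
    print(df.columns.tolist())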
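Editor's note (illustration only, not part of the patch): a quick summary of the
three ``max_level`` cases exercised by ``test_with_max_level``, calling the
internal ``nested_to_record`` helper directly on the same fixture data::

    from pandas.io.json.normalize import nested_to_record

    data = {'CreatedBy': {'Name': 'User001'},
            'Lookup': {'TextField': 'Some text',
                       'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
            'Image': {'a': 'b'}}

    print(sorted(nested_to_record(data, max_level=0)))  # top-level keys only
    # ['CreatedBy', 'Image', 'Lookup']
    print(sorted(nested_to_record(data, max_level=1)))  # one level expanded
    # ['CreatedBy.Name', 'Image.a', 'Lookup.TextField', 'Lookup.UserField']
    print(sorted(nested_to_record(data)))               # default: fully flattened
    # ['CreatedBy.Name', 'Image.a', 'Lookup.TextField',
    #  'Lookup.UserField.Id', 'Lookup.UserField.Name']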
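Editor's note (illustration only, not part of the patch): the ``# type:`` comments
added to ``json_normalize`` use comment-style annotations rather than PEP 526
variable annotations, presumably so the module stays importable on the oldest
Python 3 version pandas still supports; both spellings carry the same information
for a type checker such as mypy::

    from collections import defaultdict
    from typing import DefaultDict, List

    # Comment-style annotations, valid on Python 3.5+:
    records = []  # type: List
    meta_vals = defaultdict(list)  # type: DefaultDict

    # Equivalent variable annotations, Python 3.6+ only:
    # records: List = []
    # meta_vals: DefaultDict = defaultdict(list)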