diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index fa4e35b08bf6e..d043166d416ce 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -25,9 +25,11 @@ def _convert_to_line_delimits(s): return convert_json_to_lines(s) -def nested_to_record(ds, prefix="", sep=".", level=0): +def nested_to_record(ds, prefix="", sep=".", level=0, + max_level=None, ignore_keys=None): """ - A simplified json_normalize. + + A simplified json_normalize Converts a nested dict into a flat dict ("record"), unlike json_normalize, it does not attempt to extract a subset of the data. @@ -36,13 +38,24 @@ def nested_to_record(ds, prefix="", sep=".", level=0): ---------- ds : dict or list of dicts prefix: the prefix, optional, default: "" - sep : string, default '.' + sep : str, default '.' Nested records will generate names separated by sep, e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar .. versionadded:: 0.20.0 - level: the number of levels in the jason string, optional, default: 0 + level: int, optional, default: 0 + The number of levels in the json string. + + max_level: int, optional, default: None + The max depth to normalize. + + .. versionadded:: 0.25.0 + + ignore_keys: list, optional, default None + keys to ignore + + .. versionadded:: 0.25.0 Returns ------- @@ -65,10 +78,9 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if isinstance(ds, dict): ds = [ds] singleton = True - + ignore_keys = ignore_keys if ignore_keys else [] new_ds = [] for d in ds: - new_d = copy.deepcopy(d) for k, v in d.items(): # each key gets renamed with prefix @@ -79,16 +91,21 @@ def nested_to_record(ds, prefix="", sep=".", level=0): else: newkey = prefix + sep + k - # only dicts gets recurse-flattend + # flatten if type is dict and + # current dict level < maximum level provided and + # current dict key not in ignore keys list flatten it # only at level>1 do we rename the rest of the keys - if not isinstance(v, dict): + if (not isinstance(v, dict) or + (max_level is not None and level >= max_level) or + (k in ignore_keys)): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v continue else: v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, sep, level + 1)) + new_d.update(nested_to_record(v, newkey, sep, level + 1, + max_level, ignore_keys)) new_ds.append(new_d) if singleton: @@ -100,41 +117,57 @@ def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', - sep='.'): + sep='.', + max_level=None, + ignore_keys=None): """ Normalize semi-structured JSON data into a flat table. Parameters ---------- data : dict or list of dicts - Unserialized JSON objects - record_path : string or list of strings, default None + Unserialized JSON objects. + record_path : str or list of str, default None Path in each object to list of records. If not passed, data will be - assumed to be an array of records - meta : list of paths (string or list of strings), default None - Fields to use as metadata for each record in resulting table - meta_prefix : string, default None - record_prefix : string, default None + assumed to be an array of records. + meta : list of paths (str or list of str), default None + Fields to use as metadata for each record in resulting table. + meta_prefix : str, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + meta is ['foo', 'bar']. + record_prefix : str, default None If True, prefix records with dotted (?) path, e.g. foo.bar.field if - path to records is ['foo', 'bar'] + path to records is ['foo', 'bar']. errors : {'raise', 'ignore'}, default 'raise' - + Configures error handling. * 'ignore' : will ignore KeyError if keys listed in meta are not - always present + always present. * 'raise' : will raise KeyError if keys listed in meta are not - always present + always present. .. versionadded:: 0.20.0 - sep : string, default '.' - Nested records will generate names separated by sep, - e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + sep : str, default '.' + Nested records will generate names separated by sep. + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar. .. versionadded:: 0.20.0 + max_level : int, default None + Max number of levels(depth of dict) to normalize. + if None, normalizes all levels. + + .. versionadded:: 0.25.0 + + ignore_keys : list, keys to ignore, default None + List of keys that you do not want to normalize. + + .. versionadded:: 0.25.0 + Returns ------- frame : DataFrame + Returns a JSON normalized Dataframe. Examples -------- @@ -149,6 +182,20 @@ def json_normalize(data, record_path=None, meta=None, 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN + >>> from pandas.io.json import json_normalize + >>> data = [{'id': 1, + ... 'name': {'first': 'Cole', 'last': 'Volk'}, + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'name': {'given': 'Mose', 'family': 'Reg'}, + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'id': 2, 'name': 'Faye Raker', + ... 'fitness': {'height': 130, 'weight': 60}}] + >>> json_normalize(data, max_level=1, ignore_keys=['name']) + fitness.height fitness.weight id name + 0 130 60 1.0 {'first': 'Cole', 'last': 'Volk'} + 1 130 60 NaN {'given': 'Mose', 'family': 'Reg'} + 2 130 60 2.0 Faye Raker + >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { @@ -167,12 +214,12 @@ def json_normalize(data, record_path=None, meta=None, >>> result = json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result - name population info.governor state shortname - 0 Dade 12345 Rick Scott Florida FL - 1 Broward 40000 Rick Scott Florida FL - 2 Palm Beach 60000 Rick Scott Florida FL - 3 Summit 1234 John Kasich Ohio OH - 4 Cuyahoga 1337 John Kasich Ohio OH + name population state shortname info.governor + 0 Dade 12345 Florida FL Rick Scott + 1 Broward 40000 Florida FL Rick Scott + 2 Palm Beach 60000 Florida FL Rick Scott + 3 Summit 1234 Ohio OH John Kasich + 4 Cuyahoga 1337 Ohio OH John Kasich >>> data = {'A': [1, 2]} >>> json_normalize(data, 'A', record_prefix='Prefix.') @@ -197,6 +244,8 @@ def _pull_field(js, spec): if isinstance(data, dict): data = [data] + ignore_keys = ignore_keys if ignore_keys else [] + if record_path is None: if any([isinstance(x, dict) for x in y.values()] for y in data): # naive normalization, this is idempotent for flat records @@ -206,7 +255,9 @@ def _pull_field(js, spec): # # TODO: handle record value which are lists, at least error # reasonably - data = nested_to_record(data, sep=sep) + data = nested_to_record(data, sep=sep, + max_level=max_level, + ignore_keys=ignore_keys) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] @@ -241,10 +292,13 @@ def _recursive_extract(data, path, seen_meta, level=0): else: for obj in data: recs = _pull_field(obj, path[0]) + recs = [nested_to_record(r, sep=sep, + max_level=max_level, + ignore_keys=ignore_keys) + if isinstance(r, dict) else r for r in recs] # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] @@ -260,7 +314,6 @@ def _recursive_extract(data, path, seen_meta, level=0): "{err} is not always present" .format(err=e)) meta_vals[key].append(meta_val) - records.extend(recs) _recursive_extract(data, record_path, {}, level=0) @@ -279,8 +332,5 @@ def _recursive_extract(data, path, seen_meta, level=0): if k in result: raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) - - # forcing dtype to object to avoid the metadata being casted to string result[k] = np.array(v, dtype=object).repeat(lengths) - return result diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a7407d843c6c9..93c970e4a94a5 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -66,6 +66,20 @@ def author_missing_data(): }] +@pytest.fixture +def deeply_nested_post_data(): + return [{'CreatedBy': {'Name': 'User001'}, + 'Lookup': [{'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}} + ], + 'Image': {'a': 'b'}, + 'tags': [{'foo': 'something', 'bar': 'else'}, + {'foo': 'something2', 'bar': 'else2'}] + }] + + @pytest.fixture def missing_metadata(): return [ @@ -294,6 +308,42 @@ def test_missing_field(self, author_missing_data): expected = DataFrame(ex_data) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("max_level", [0, 1]) + def test_max_level_with_records_path(self, + deeply_nested_post_data, + max_level): + + expected_data = {0: [{"TextField": "Some text", + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}, + {"TextField": "Some text", + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}], + 1: [{"TextField": "Some text", + "UserField.Id": "ID001", + "UserField.Name": "Name001", + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}, + {"TextField": "Some text", + "UserField.Id": "ID001", + "UserField.Name": "Name001", + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}]} + + test_input = deeply_nested_post_data + expected_data = expected_data[max_level] + result = json_normalize(test_input, + record_path=["Lookup"], + meta=[["CreatedBy"], ["Image"]], + max_level=max_level) + expected_df = DataFrame(data=expected_data, + columns=result.columns.values) + tm.assert_equal(expected_df, result) + class TestNestedToRecord: @@ -301,7 +351,6 @@ def test_flat_stays_flat(self): recs = [dict(flat1=1, flat2=2), dict(flat1=3, flat2=4), ] - result = nested_to_record(recs) expected = recs assert result == expected @@ -370,10 +419,6 @@ def test_missing_meta(self, missing_metadata): 'zip': 37643, 'name': np.nan} ] - ex_data = [ - ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], - ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan] - ] columns = ['city', 'number', 'state', 'street', 'zip', 'name'] expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) @@ -460,3 +505,94 @@ def test_nonetype_multiple_levels(self): 'location.country.state.town.info.y': -33.148521423339844, 'location.country.state.town.info.z': 27.572303771972656} assert result == expected + + def test_with_max_level_none(self): + # GH23843: Enhanced JSON normalize + data = [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + expected_output = [{ + 'CreatedBy.Name': 'User001', + 'Lookup.TextField': 'Some text', + 'Lookup.UserField.Id': 'ID001', + 'Lookup.UserField.Name': 'Name001', + 'Image': {'a': 'b'} + }] + output = nested_to_record(data, ignore_keys=["Image"]) + assert output == expected_output + + def test_with_max_level_zero(self): + data = [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + output = nested_to_record(data, max_level=0, ignore_keys=["Image"]) + assert output == data + + def test_with_max_level_one(self): + data = [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + expected_output = [{ + 'CreatedBy.Name': 'User001', + 'Lookup.TextField': 'Some text', + 'Lookup.UserField': {'Id': 'ID001', 'Name': 'Name001'}, + 'Image': {'a': 'b'} + }] + output = nested_to_record(data, max_level=1, ignore_keys=["Image"]) + assert output == expected_output + + def test_with_large_max_level(self): + data = [ + {'CreatedBy': { + "user": { + "name": {"firstname": "Leo", + "LastName": "Thomson"}, + "family_tree": { + "father": { + "name": "Father001", + "father": { + "Name": "Father002", + "father": { + "name": "Father003", + "father": { + "Name": "Father004", + }, + }, + } + } + } + } + }} + ] + expected_output = [ + {'CreatedBy.user.name.firstname': 'Leo', + 'CreatedBy.user.name.LastName': 'Thomson', + 'CreatedBy.user.family_tree.father.name': 'Father001', + 'CreatedBy.user.family_tree.father.father.Name': 'Father002', + 'CreatedBy.user.family_tree.father.father.father.name': + 'Father003', + 'CreatedBy.user.family_tree.father.father.father.father.Name': + 'Father004'} + ] + + output = nested_to_record(data, max_level=100) + assert output == expected_output + + def test_with_all_keys_to_ignore(self): + data = [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + output = nested_to_record(data, ignore_keys=list(data[0].keys())) + assert output == data