-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
Enhanced json normalize #23861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enhanced json normalize #23861
Changes from 27 commits
cb53be7
0972746
5a5c708
be7ec0e
a79e126
cd12a23
d3b3503
4ec60bc
e001264
5c88339
55f7b1c
1af2bfc
882a2ca
caba6db
4e22c69
c2eff85
247124f
ab15869
26bf967
fca2a27
7a58456
f3d25e3
7a1297d
177c750
cb82bca
2a7b966
4635591
22fd84e
2e407e3
cf27cae
124fbd9
7b65999
03d3d23
8e61a04
b808d5a
0eaea30
837ba18
217d4ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,9 +25,11 @@ def _convert_to_line_delimits(s): | |
return convert_json_to_lines(s) | ||
|
||
|
||
def nested_to_record(ds, prefix="", sep=".", level=0): | ||
def nested_to_record(ds, prefix="", sep=".", level=0, | ||
max_level=None, ignore_keys=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you type these parameters (i.e. add type annotations)? |
||
""" | ||
A simplified json_normalize. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you revert the change to this line? |
||
A simplified json_normalize | ||
|
||
Converts a nested dict into a flat dict ("record"), unlike json_normalize, | ||
it does not attempt to extract a subset of the data. | ||
|
@@ -42,7 +44,15 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
|
||
.. versionadded:: 0.20.0 | ||
|
||
level: the number of levels in the jason string, optional, default: 0 | ||
level: int, optional, the number of levels in the json string, default: 0 | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
max_level: int, optional, maximum number of nested levels to normalize, default: None | |
|
||
.. versionadded:: 0.25.0 | ||
|
||
ignore_keys: list, optional, keys to ignore, default None | ||
|
||
.. versionadded:: 0.25.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not lined up. |
||
|
||
Returns | ||
------- | ||
|
@@ -65,10 +75,9 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
if isinstance(ds, dict): | ||
ds = [ds] | ||
singleton = True | ||
|
||
ignore_keys = ignore_keys if ignore_keys else [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this getting mutated? |
||
new_ds = [] | ||
for d in ds: | ||
|
||
new_d = copy.deepcopy(d) | ||
for k, v in d.items(): | ||
# each key gets renamed with prefix | ||
|
@@ -79,16 +88,21 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
else: | ||
newkey = prefix + sep + k | ||
|
||
# only dicts gets recurse-flattend | ||
# flatten if type is dict and | ||
# current dict level < maximum level provided and | ||
# current dict key not in ignore keys list flatten it | ||
# only at level>1 do we rename the rest of the keys | ||
if not isinstance(v, dict): | ||
if (not isinstance(v, dict) or | ||
(max_level is not None and level >= max_level) or | ||
(k in ignore_keys)): | ||
if level != 0: # so we skip copying for top level, common case | ||
v = new_d.pop(k) | ||
new_d[newkey] = v | ||
continue | ||
else: | ||
v = new_d.pop(k) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1)) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1, | ||
max_level, ignore_keys)) | ||
new_ds.append(new_d) | ||
|
||
if singleton: | ||
|
@@ -100,25 +114,26 @@ def json_normalize(data, record_path=None, meta=None, | |
meta_prefix=None, | ||
record_prefix=None, | ||
errors='raise', | ||
sep='.'): | ||
sep='.', | ||
max_level=None, | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ignore_keys=None): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Normalize semi-structured JSON data into a flat table. | ||
|
||
Parameters | ||
---------- | ||
data : dict or list of dicts | ||
Unserialized JSON objects | ||
record_path : string or list of strings, default None | ||
record_path : str or list of string, default None | ||
Path in each object to list of records. If not passed, data will be | ||
assumed to be an array of records | ||
assumed to be an array of records. | ||
meta : list of paths (string or list of strings), default None | ||
Fields to use as metadata for each record in resulting table | ||
Fields to use as metadata for each record in resulting table. | ||
meta_prefix : string, default None | ||
record_prefix : string, default None | ||
If True, prefix records with dotted (?) path, e.g. foo.bar.field if | ||
path to records is ['foo', 'bar'] | ||
path to records is ['foo', 'bar']. | ||
errors : {'raise', 'ignore'}, default 'raise' | ||
|
||
* 'ignore' : will ignore KeyError if keys listed in meta are not | ||
always present | ||
* 'raise' : will raise KeyError if keys listed in meta are not | ||
|
@@ -132,6 +147,17 @@ def json_normalize(data, record_path=None, meta=None, | |
|
||
.. versionadded:: 0.20.0 | ||
|
||
max_level : integer, default None | ||
Max number of levels (depth of dict) to normalize. | |
If None, normalizes all levels. | |
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor capitalization issue here. |
||
|
||
.. versionadded:: 0.25.0 | ||
|
||
ignore_keys : list, optional, default None | |
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
List of keys that you do not want to normalize. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
Returns | ||
------- | ||
frame : DataFrame | ||
|
@@ -149,6 +175,20 @@ def json_normalize(data, record_path=None, meta=None, | |
1 NaN NaN Regner NaN Mose NaN | ||
2 2.0 Faye Raker NaN NaN NaN NaN | ||
|
||
>>> from pandas.io.json import json_normalize | ||
>>> data = [{'id': 1, | ||
... 'name': {'first': 'Coleen', 'last': 'Volk'}, | ||
... "fitness": {"height":130, "weight":60}}, | ||
... {'name': {'given': 'Mose', 'family': 'Regner'}, | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
... "fitness": {"height":130, "weight":60}}, | ||
... {'id': 2, 'name': 'Faye Raker', | ||
... "fitness": {"height":130, "weight":60}}] | ||
>>> json_normalize(data, max_level=1, ignore_keys=["name"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After giving this some more thought... can we split this PR up? I think this one should just focus on |
||
fitness.height fitness.weight id name | ||
0 130 60 1.0 {'first': 'Coleen', 'last': 'Volk'} | ||
1 130 60 NaN {'given': 'Mose', 'family': 'Regner'} | ||
2 130 60 2.0 Faye Raker | ||
|
||
>>> data = [{'state': 'Florida', | ||
... 'shortname': 'FL', | ||
... 'info': { | ||
|
@@ -167,12 +207,12 @@ def json_normalize(data, record_path=None, meta=None, | |
>>> result = json_normalize(data, 'counties', ['state', 'shortname', | ||
... ['info', 'governor']]) | ||
>>> result | ||
name population info.governor state shortname | ||
0 Dade 12345 Rick Scott Florida FL | ||
1 Broward 40000 Rick Scott Florida FL | ||
2 Palm Beach 60000 Rick Scott Florida FL | ||
3 Summit 1234 John Kasich Ohio OH | ||
4 Cuyahoga 1337 John Kasich Ohio OH | ||
name population state shortname info.governor | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is there a period here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because |
||
0 Dade 12345 Florida FL Rick Scott | ||
1 Broward 40000 Florida FL Rick Scott | ||
2 Palm Beach 60000 Florida FL Rick Scott | ||
3 Summit 1234 Ohio OH John Kasich | ||
4 Cuyahoga 1337 Ohio OH John Kasich | ||
|
||
>>> data = {'A': [1, 2]} | ||
>>> json_normalize(data, 'A', record_prefix='Prefix.') | ||
|
@@ -197,6 +237,8 @@ def _pull_field(js, spec): | |
if isinstance(data, dict): | ||
data = [data] | ||
|
||
ignore_keys = ignore_keys if ignore_keys else [] | ||
|
||
if record_path is None: | ||
if any([isinstance(x, dict) for x in y.values()] for y in data): | ||
# naive normalization, this is idempotent for flat records | ||
|
@@ -206,7 +248,9 @@ def _pull_field(js, spec): | |
# | ||
# TODO: handle record value which are lists, at least error | ||
# reasonably | ||
data = nested_to_record(data, sep=sep) | ||
data = nested_to_record(data, sep=sep, | ||
max_level=max_level, | ||
ignore_keys=ignore_keys) | ||
return DataFrame(data) | ||
elif not isinstance(record_path, list): | ||
record_path = [record_path] | ||
|
@@ -241,10 +285,13 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
else: | ||
for obj in data: | ||
recs = _pull_field(obj, path[0]) | ||
recs = [nested_to_record(r, sep=sep, | ||
max_level=max_level, | ||
ignore_keys=ignore_keys) | ||
if isinstance(r, dict) else r for r in recs] | ||
|
||
# For repeating the metadata later | ||
lengths.append(len(recs)) | ||
|
||
for val, key in zip(meta, meta_keys): | ||
if level + 1 > len(val): | ||
meta_val = seen_meta[key] | ||
|
@@ -260,7 +307,6 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
"{err} is not always present" | ||
.format(err=e)) | ||
meta_vals[key].append(meta_val) | ||
|
||
records.extend(recs) | ||
|
||
_recursive_extract(data, record_path, {}, level=0) | ||
|
@@ -279,8 +325,5 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
if k in result: | ||
raise ValueError('Conflicting metadata name {name}, ' | ||
'need distinguishing prefix '.format(name=k)) | ||
|
||
# forcing dtype to object to avoid the metadata being casted to string | ||
result[k] = np.array(v, dtype=object).repeat(lengths) | ||
|
||
return result |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,6 +66,39 @@ def author_missing_data(): | |
}] | ||
|
||
|
||
@pytest.fixture | ||
def deeply_nested_post_data(): | ||
return [{'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': [{'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
{'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}} | ||
], | ||
'Image': {'a': 'b'}, | ||
'tags': [{'foo': 'something', 'bar': 'else'}, | ||
{'foo': 'something2', 'bar': 'else2'}] | ||
}] | ||
|
||
|
||
def expected_data_test_path_with_nested_data(): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return {0: [{"TextField": "Some text", | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}, | ||
"CreatedBy": {"Name": "User001"}, | ||
'Image': {'a': 'b'}}, | ||
{"TextField": "Some text", | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}, | ||
"CreatedBy": {"Name": "User001"}, | ||
'Image': {'a': 'b'}}], | ||
1: [{"TextField": "Some text", "UserField.Id": "ID001", | ||
"UserField.Name": "Name001", | ||
"CreatedBy": {"Name": "User001"}, | ||
'Image': {'a': 'b'}}, | ||
{"TextField": "Some text", "UserField.Id": "ID001", | ||
"UserField.Name": "Name001", | ||
"CreatedBy": {"Name": "User001"}, | ||
'Image': {'a': 'b'}}]} | ||
|
||
|
||
@pytest.fixture | ||
def missing_metadata(): | ||
return [ | ||
|
@@ -294,14 +327,27 @@ def test_missing_field(self, author_missing_data): | |
expected = DataFrame(ex_data) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("max_level", [0, 1]) | ||
def test_max_level_with_records_path(self, | ||
deeply_nested_post_data, | ||
max_level): | ||
test_input = deeply_nested_post_data | ||
expected_data = expected_data_test_path_with_nested_data()[max_level] | ||
result = json_normalize(test_input, | ||
record_path=["Lookup"], | ||
meta=[["CreatedBy"], ["Image"]], | ||
max_level=max_level) | ||
expected_df = DataFrame(data=expected_data, | ||
columns=result.columns.values) | ||
tm.assert_equal(expected_df, result) | ||
|
||
|
||
class TestNestedToRecord: | ||
|
||
def test_flat_stays_flat(self): | ||
recs = [dict(flat1=1, flat2=2), | ||
dict(flat1=3, flat2=4), | ||
] | ||
|
||
result = nested_to_record(recs) | ||
expected = recs | ||
assert result == expected | ||
|
@@ -370,12 +416,14 @@ def test_missing_meta(self, missing_metadata): | |
'zip': 37643, | ||
'name': np.nan} | ||
] | ||
ex_data = [ | ||
['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], | ||
['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan] | ||
] | ||
# ex_data = [ | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], | ||
# ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, "nan"] | ||
# ] | ||
|
||
columns = ['city', 'number', 'state', 'street', 'zip', 'name'] | ||
expected = DataFrame(ex_data, columns=columns) | ||
# print(type(expected["name"][1]), type(result["name"][1])) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_donot_drop_nonevalues(self): | ||
|
@@ -460,3 +508,93 @@ def test_nonetype_multiple_levels(self): | |
'location.country.state.town.info.y': -33.148521423339844, | ||
'location.country.state.town.info.z': 27.572303771972656} | ||
assert result == expected | ||
|
||
def test_with_max_level_none(self): | ||
data = [{ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need the issue number as a comment. |
||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
expected_output = [{ | ||
'CreatedBy.Name': 'User001', | ||
'Lookup.TextField': 'Some text', | ||
'Lookup.UserField.Id': 'ID001', | ||
'Lookup.UserField.Name': 'Name001', | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, ignore_keys=["Image"]) | ||
assert output == expected_output | ||
|
||
def test_with_max_level_zero(self): | ||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, max_level=0, ignore_keys=["Image"]) | ||
assert output == data | ||
|
||
def test_with_max_level_one(self): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
expected_output = [{ | ||
'CreatedBy.Name': 'User001', | ||
'Lookup.TextField': 'Some text', | ||
'Lookup.UserField': {'Id': 'ID001', 'Name': 'Name001'}, | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, max_level=1, ignore_keys=["Image"]) | ||
assert output == expected_output | ||
|
||
def test_with_large_max_level(self): | ||
data = [ | ||
{'CreatedBy': { | ||
"user": { | ||
"name": {"firstname": "Leo", | ||
"LastName": "Thomson"}, | ||
"family_tree": { | ||
"father": { | ||
"name": "Father001", | ||
"father": { | ||
"Name": "Father002", | ||
"father": { | ||
"name": "Father003", | ||
"father": { | ||
"Name": "Father004", | ||
}, | ||
}, | ||
} | ||
} | ||
} | ||
} | ||
}} | ||
] | ||
expected_output = [ | ||
{'CreatedBy.user.name.firstname': 'Leo', | ||
'CreatedBy.user.name.LastName': 'Thomson', | ||
'CreatedBy.user.family_tree.father.name': 'Father001', | ||
'CreatedBy.user.family_tree.father.father.Name': 'Father002', | ||
'CreatedBy.user.family_tree.father.father.father.name': | ||
'Father003', | ||
'CreatedBy.user.family_tree.father.father.father.father.Name': | ||
'Father004'} | ||
] | ||
|
||
output = nested_to_record(data, max_level=100) | ||
assert output == expected_output | ||
|
||
def test_with_all_keys_to_ignore(self): | ||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, ignore_keys=list(data[0].keys())) | ||
assert output == data |
Uh oh!
There was an error while loading. Please reload this page.