-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Enhancement Add max_level param to json_normalize #26876
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 47 commits
cb53be7
0972746
5a5c708
be7ec0e
a79e126
cd12a23
d3b3503
4ec60bc
e001264
5c88339
55f7b1c
1af2bfc
882a2ca
caba6db
4e22c69
c2eff85
247124f
ab15869
26bf967
fca2a27
7a58456
f3d25e3
7a1297d
177c750
cb82bca
2a7b966
4635591
22fd84e
2e407e3
cf27cae
124fbd9
7b65999
03d3d23
8e61a04
b808d5a
0eaea30
837ba18
217d4ae
b2fc133
acf1137
ff30152
fa2ecee
aed2db5
f5dacd6
33e2504
699d696
a91f27a
0a04cdb
e4e586d
62a35db
d113401
bfa62cf
53b6bcb
2bc829b
d6a7cc7
a69ad2b
b0133d2
1564d49
463adc7
bbf894a
775472e
f8f550a
4ddf0cc
9f2d356
f3ff665
20432ad
311b898
7850db7
676c7f1
e2796d4
69a0d43
3a80a4d
c288c2e
f96d8fb
3ec85bf
ba1d983
4b754a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ | |
from pandas._libs.writers import convert_json_to_lines | ||
|
||
from pandas import DataFrame | ||
|
||
from typing import Union | ||
|
||
def _convert_to_line_delimits(s): | ||
""" | ||
|
@@ -25,9 +25,11 @@ def _convert_to_line_delimits(s): | |
return convert_json_to_lines(s) | ||
|
||
|
||
def nested_to_record(ds, prefix="", sep=".", level=0): | ||
def nested_to_record(ds, prefix: str="", sep: str=".", level: int=0, | ||
max_level: int=None): | ||
""" | ||
A simplified json_normalize. | ||
|
||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
A simplified json_normalize | ||
|
||
Converts a nested dict into a flat dict ("record"), unlike json_normalize, | ||
it does not attempt to extract a subset of the data. | ||
|
@@ -36,13 +38,19 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
---------- | ||
ds : dict or list of dicts | ||
prefix: the prefix, optional, default: "" | ||
sep : string, default '.' | ||
sep : str, default '.' | ||
Nested records will generate names separated by sep, | ||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
level: the number of levels in the jason string, optional, default: 0 | ||
level: int, optional, default: 0 | ||
The number of levels in the json string. | ||
|
||
max_level: int, optional, default: None | ||
The max depth to normalize. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
Returns | ||
------- | ||
|
@@ -65,10 +73,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
if isinstance(ds, dict): | ||
ds = [ds] | ||
singleton = True | ||
|
||
new_ds = [] | ||
for d in ds: | ||
|
||
new_d = copy.deepcopy(d) | ||
for k, v in d.items(): | ||
# each key gets renamed with prefix | ||
|
@@ -79,63 +85,77 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
else: | ||
newkey = prefix + sep + k | ||
|
||
# only dicts gets recurse-flattend | ||
# flatten if type is dict and | ||
# current dict level < maximum level provided and | ||
# only at level>1 do we rename the rest of the keys | ||
if not isinstance(v, dict): | ||
if (not isinstance(v, dict) or | ||
(max_level is not None and level >= max_level)): | ||
if level != 0: # so we skip copying for top level, common case | ||
v = new_d.pop(k) | ||
new_d[newkey] = v | ||
continue | ||
else: | ||
v = new_d.pop(k) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1)) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1, | ||
max_level)) | ||
new_ds.append(new_d) | ||
|
||
if singleton: | ||
return new_ds[0] | ||
return new_ds | ||
|
||
|
||
def json_normalize(data, record_path=None, meta=None, | ||
meta_prefix=None, | ||
record_prefix=None, | ||
def json_normalize(data:dict, record_path: Union[str, list]=None, meta:Union[str, list]=None, | ||
meta_prefix: str=None, | ||
record_prefix: str=None, | ||
errors='raise', | ||
sep='.'): | ||
sep: str ='.', | ||
max_level: int=None): | ||
""" | ||
Normalize semi-structured JSON data into a flat table. | ||
|
||
Parameters | ||
---------- | ||
data : dict or list of dicts | ||
Unserialized JSON objects | ||
record_path : string or list of strings, default None | ||
Unserialized JSON objects. | ||
record_path : str or list of str, default None | ||
Path in each object to list of records. If not passed, data will be | ||
assumed to be an array of records | ||
meta : list of paths (string or list of strings), default None | ||
Fields to use as metadata for each record in resulting table | ||
meta_prefix : string, default None | ||
record_prefix : string, default None | ||
assumed to be an array of records. | ||
meta : list of paths (str or list of str), default None | ||
Fields to use as metadata for each record in resulting table. | ||
meta_prefix : str, default None | ||
If True, prefix records with dotted (?) path, e.g. foo.bar.field if | ||
path to records is ['foo', 'bar'] | ||
meta is ['foo', 'bar']. | ||
record_prefix : str, default None | ||
If True, prefix records with dotted (?) path, e.g. foo.bar.field if | ||
path to records is ['foo', 'bar']. | ||
errors : {'raise', 'ignore'}, default 'raise' | ||
|
||
Configures error handling. | ||
* 'ignore' : will ignore KeyError if keys listed in meta are not | ||
always present | ||
always present. | ||
* 'raise' : will raise KeyError if keys listed in meta are not | ||
always present | ||
always present. | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
sep : string, default '.' | ||
Nested records will generate names separated by sep, | ||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar | ||
sep : str, default '.' | ||
Nested records will generate names separated by sep. | ||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar. | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
max_level : int, default None | ||
Max number of levels (depth of dict) to normalize. | ||
If None, normalizes all levels. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
Returns | ||
------- | ||
frame : DataFrame | ||
|
||
Normalize semi-structured JSON data into a flat table. | ||
|
||
Examples | ||
-------- | ||
|
||
|
@@ -149,30 +169,54 @@ def json_normalize(data, record_path=None, meta=None, | |
1 NaN NaN Regner NaN Mose NaN | ||
2 2.0 Faye Raker NaN NaN NaN NaN | ||
|
||
>>> from pandas.io.json import json_normalize | ||
Review comment: Can you give the same example without max_level as well? Reply: The output will be the same without max_level, whereas max_level=0 will have a different effect. Will that work? Reply: Sure. |
||
>>> data = [{'id': 1, | ||
... 'name': "Cole Volk", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'name': "Mose Reg", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, max_level=0) | ||
fitness id name | ||
0 {'height': 130, 'weight': 60} 1.0 Cole Volk | ||
1 {'height': 130, 'weight': 60} NaN Mose Reg | ||
2 {'height': 130, 'weight': 60} 2.0 Faye Raker | ||
|
||
>>> from pandas.io.json import json_normalize | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
>>> data = [{'id': 1, | ||
... 'name': "Cole Volk", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'name': "Mose Reg", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, max_level=1) | ||
fitness.height fitness.weight id name | ||
0 130 60 1.0 Cole Volk | ||
1 130 60 NaN Mose Reg | ||
2 130 60 2.0 Faye Raker | ||
|
||
>>> data = [{'state': 'Florida', | ||
... 'shortname': 'FL', | ||
... 'info': { | ||
... 'governor': 'Rick Scott' | ||
... }, | ||
... 'info': {'governor': 'Rick Scott'}, | ||
... 'counties': [{'name': 'Dade', 'population': 12345}, | ||
... {'name': 'Broward', 'population': 40000}, | ||
... {'name': 'Palm Beach', 'population': 60000}]}, | ||
... {'name': 'Broward', 'population': 40000}, | ||
... {'name': 'Palm Beach', 'population': 60000}]}, | ||
... {'state': 'Ohio', | ||
... 'shortname': 'OH', | ||
... 'info': { | ||
... 'governor': 'John Kasich' | ||
... }, | ||
... 'info': {'governor': 'John Kasich'}, | ||
... 'counties': [{'name': 'Summit', 'population': 1234}, | ||
... {'name': 'Cuyahoga', 'population': 1337}]}] | ||
>>> result = json_normalize(data, 'counties', ['state', 'shortname', | ||
... ['info', 'governor']]) | ||
>>> result | ||
name population info.governor state shortname | ||
0 Dade 12345 Rick Scott Florida FL | ||
1 Broward 40000 Rick Scott Florida FL | ||
2 Palm Beach 60000 Rick Scott Florida FL | ||
3 Summit 1234 John Kasich Ohio OH | ||
4 Cuyahoga 1337 John Kasich Ohio OH | ||
name population state shortname info.governor | ||
0 Dade 12345 Florida FL Rick Scott | ||
1 Broward 40000 Florida FL Rick Scott | ||
2 Palm Beach 60000 Florida FL Rick Scott | ||
3 Summit 1234 Ohio OH John Kasich | ||
4 Cuyahoga 1337 Ohio OH John Kasich | ||
|
||
>>> data = {'A': [1, 2]} | ||
>>> json_normalize(data, 'A', record_prefix='Prefix.') | ||
|
@@ -206,7 +250,8 @@ def _pull_field(js, spec): | |
# | ||
# TODO: handle record value which are lists, at least error | ||
# reasonably | ||
data = nested_to_record(data, sep=sep) | ||
data = nested_to_record(data, sep=sep, | ||
max_level=max_level) | ||
return DataFrame(data) | ||
elif not isinstance(record_path, list): | ||
record_path = [record_path] | ||
|
@@ -241,10 +286,12 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
else: | ||
for obj in data: | ||
recs = _pull_field(obj, path[0]) | ||
recs = [nested_to_record(r, sep=sep, | ||
max_level=max_level) | ||
if isinstance(r, dict) else r for r in recs] | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# For repeating the metadata later | ||
lengths.append(len(recs)) | ||
|
||
for val, key in zip(meta, meta_keys): | ||
if level + 1 > len(val): | ||
meta_val = seen_meta[key] | ||
|
@@ -260,7 +307,6 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
"{err} is not always present" | ||
.format(err=e)) | ||
meta_vals[key].append(meta_val) | ||
|
||
records.extend(recs) | ||
|
||
_recursive_extract(data, record_path, {}, level=0) | ||
|
@@ -279,8 +325,5 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
if k in result: | ||
raise ValueError('Conflicting metadata name {name}, ' | ||
'need distinguishing prefix '.format(name=k)) | ||
|
||
# forcing dtype to object to avoid the metadata being casted to string | ||
result[k] = np.array(v, dtype=object).repeat(lengths) | ||
|
||
return result |
Uh oh!
There was an error while loading. Please reload this page.