-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Enhanced json normalize #23861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enhanced json normalize #23861
Changes from all commits
cb53be7
0972746
5a5c708
be7ec0e
a79e126
cd12a23
d3b3503
4ec60bc
e001264
5c88339
55f7b1c
1af2bfc
882a2ca
caba6db
4e22c69
c2eff85
247124f
ab15869
26bf967
fca2a27
7a58456
f3d25e3
7a1297d
177c750
cb82bca
2a7b966
4635591
22fd84e
2e407e3
cf27cae
124fbd9
7b65999
03d3d23
8e61a04
b808d5a
0eaea30
837ba18
217d4ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,9 +25,11 @@ def _convert_to_line_delimits(s): | |
return convert_json_to_lines(s) | ||
|
||
|
||
def nested_to_record(ds, prefix="", sep=".", level=0): | ||
def nested_to_record(ds, prefix="", sep=".", level=0, | ||
max_level=None, ignore_keys=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you type these parameters? |
||
""" | ||
A simplified json_normalize. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you revert the change to this line? |
||
A simplified json_normalize | ||
|
||
Converts a nested dict into a flat dict ("record"), unlike json_normalize, | ||
it does not attempt to extract a subset of the data. | ||
|
@@ -36,13 +38,24 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
---------- | ||
ds : dict or list of dicts | ||
prefix: the prefix, optional, default: "" | ||
sep : string, default '.' | ||
sep : str, default '.' | ||
Nested records will generate names separated by sep, | ||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
level: the number of levels in the jason string, optional, default: 0 | ||
level: int, optional, default: 0 | ||
The number of levels in the json string. | ||
|
||
max_level: int, optional, default: None | ||
The max depth to normalize. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
ignore_keys: list, optional, default None | ||
keys to ignore | ||
|
||
.. versionadded:: 0.25.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not lined up. |
||
|
||
Returns | ||
------- | ||
|
@@ -65,10 +78,9 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
if isinstance(ds, dict): | ||
ds = [ds] | ||
singleton = True | ||
|
||
ignore_keys = ignore_keys if ignore_keys else [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this getting mutated? |
||
new_ds = [] | ||
for d in ds: | ||
|
||
new_d = copy.deepcopy(d) | ||
for k, v in d.items(): | ||
# each key gets renamed with prefix | ||
|
@@ -79,16 +91,21 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
else: | ||
newkey = prefix + sep + k | ||
|
||
# only dicts gets recurse-flattend | ||
# flatten if type is dict and | ||
# current dict level < maximum level provided and | ||
# current dict key not in ignore keys list flatten it | ||
# only at level>1 do we rename the rest of the keys | ||
if not isinstance(v, dict): | ||
if (not isinstance(v, dict) or | ||
(max_level is not None and level >= max_level) or | ||
(k in ignore_keys)): | ||
if level != 0: # so we skip copying for top level, common case | ||
v = new_d.pop(k) | ||
new_d[newkey] = v | ||
continue | ||
else: | ||
v = new_d.pop(k) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1)) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1, | ||
max_level, ignore_keys)) | ||
new_ds.append(new_d) | ||
|
||
if singleton: | ||
|
@@ -100,41 +117,57 @@ def json_normalize(data, record_path=None, meta=None, | |
meta_prefix=None, | ||
record_prefix=None, | ||
errors='raise', | ||
sep='.'): | ||
sep='.', | ||
max_level=None, | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ignore_keys=None): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Normalize semi-structured JSON data into a flat table. | ||
|
||
Parameters | ||
---------- | ||
data : dict or list of dicts | ||
Unserialized JSON objects | ||
record_path : string or list of strings, default None | ||
Unserialized JSON objects. | ||
record_path : str or list of str, default None | ||
Path in each object to list of records. If not passed, data will be | ||
assumed to be an array of records | ||
meta : list of paths (string or list of strings), default None | ||
Fields to use as metadata for each record in resulting table | ||
meta_prefix : string, default None | ||
record_prefix : string, default None | ||
assumed to be an array of records. | ||
meta : list of paths (str or list of str), default None | ||
Fields to use as metadata for each record in resulting table. | ||
meta_prefix : str, default None | ||
If True, prefix records with dotted (?) path, e.g. foo.bar.field if | ||
meta is ['foo', 'bar']. | ||
record_prefix : str, default None | ||
If True, prefix records with dotted (?) path, e.g. foo.bar.field if | ||
path to records is ['foo', 'bar'] | ||
path to records is ['foo', 'bar']. | ||
errors : {'raise', 'ignore'}, default 'raise' | ||
|
||
Configures error handling. | ||
* 'ignore' : will ignore KeyError if keys listed in meta are not | ||
always present | ||
always present. | ||
* 'raise' : will raise KeyError if keys listed in meta are not | ||
always present | ||
always present. | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
sep : string, default '.' | ||
Nested records will generate names separated by sep, | ||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar | ||
sep : str, default '.' | ||
Nested records will generate names separated by sep. | ||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar. | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
max_level : int, default None | ||
Max number of levels(depth of dict) to normalize. | ||
if None, normalizes all levels. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor capitalization issue here. |
||
|
||
.. versionadded:: 0.25.0 | ||
|
||
ignore_keys : list, keys to ignore, default None | ||
List of keys that you do not want to normalize. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
Returns | ||
------- | ||
frame : DataFrame | ||
Returns a JSON normalized Dataframe. | ||
|
||
Examples | ||
-------- | ||
|
@@ -149,6 +182,20 @@ def json_normalize(data, record_path=None, meta=None, | |
1 NaN NaN Regner NaN Mose NaN | ||
2 2.0 Faye Raker NaN NaN NaN NaN | ||
|
||
>>> from pandas.io.json import json_normalize | ||
>>> data = [{'id': 1, | ||
... 'name': {'first': 'Cole', 'last': 'Volk'}, | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'name': {'given': 'Mose', 'family': 'Reg'}, | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, max_level=1, ignore_keys=['name']) | ||
fitness.height fitness.weight id name | ||
0 130 60 1.0 {'first': 'Cole', 'last': 'Volk'} | ||
1 130 60 NaN {'given': 'Mose', 'family': 'Reg'} | ||
2 130 60 2.0 Faye Raker | ||
|
||
>>> data = [{'state': 'Florida', | ||
... 'shortname': 'FL', | ||
... 'info': { | ||
|
@@ -167,12 +214,12 @@ def json_normalize(data, record_path=None, meta=None, | |
>>> result = json_normalize(data, 'counties', ['state', 'shortname', | ||
... ['info', 'governor']]) | ||
>>> result | ||
name population info.governor state shortname | ||
0 Dade 12345 Rick Scott Florida FL | ||
1 Broward 40000 Rick Scott Florida FL | ||
2 Palm Beach 60000 Rick Scott Florida FL | ||
3 Summit 1234 John Kasich Ohio OH | ||
4 Cuyahoga 1337 John Kasich Ohio OH | ||
name population state shortname info.governor | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is there a period here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because |
||
0 Dade 12345 Florida FL Rick Scott | ||
1 Broward 40000 Florida FL Rick Scott | ||
2 Palm Beach 60000 Florida FL Rick Scott | ||
3 Summit 1234 Ohio OH John Kasich | ||
4 Cuyahoga 1337 Ohio OH John Kasich | ||
|
||
>>> data = {'A': [1, 2]} | ||
>>> json_normalize(data, 'A', record_prefix='Prefix.') | ||
|
@@ -197,6 +244,8 @@ def _pull_field(js, spec): | |
if isinstance(data, dict): | ||
data = [data] | ||
|
||
ignore_keys = ignore_keys if ignore_keys else [] | ||
|
||
if record_path is None: | ||
if any([isinstance(x, dict) for x in y.values()] for y in data): | ||
# naive normalization, this is idempotent for flat records | ||
|
@@ -206,7 +255,9 @@ def _pull_field(js, spec): | |
# | ||
# TODO: handle record value which are lists, at least error | ||
# reasonably | ||
data = nested_to_record(data, sep=sep) | ||
data = nested_to_record(data, sep=sep, | ||
max_level=max_level, | ||
ignore_keys=ignore_keys) | ||
return DataFrame(data) | ||
elif not isinstance(record_path, list): | ||
record_path = [record_path] | ||
|
@@ -241,10 +292,13 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
else: | ||
for obj in data: | ||
recs = _pull_field(obj, path[0]) | ||
recs = [nested_to_record(r, sep=sep, | ||
max_level=max_level, | ||
ignore_keys=ignore_keys) | ||
if isinstance(r, dict) else r for r in recs] | ||
|
||
# For repeating the metadata later | ||
lengths.append(len(recs)) | ||
|
||
for val, key in zip(meta, meta_keys): | ||
if level + 1 > len(val): | ||
meta_val = seen_meta[key] | ||
|
@@ -260,7 +314,6 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
"{err} is not always present" | ||
.format(err=e)) | ||
meta_vals[key].append(meta_val) | ||
|
||
records.extend(recs) | ||
|
||
_recursive_extract(data, record_path, {}, level=0) | ||
|
@@ -279,8 +332,5 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
if k in result: | ||
raise ValueError('Conflicting metadata name {name}, ' | ||
'need distinguishing prefix '.format(name=k)) | ||
|
||
# forcing dtype to object to avoid the metadata being casted to string | ||
result[k] = np.array(v, dtype=object).repeat(lengths) | ||
|
||
return result |
Uh oh!
There was an error while loading. Please reload this page.