Skip to content

BUG: improve pd.io.json_normalize #57811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 42 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
606e96d
Update _normalize.py
slavanorm Mar 11, 2024
cc7aec8
Update test_normalize.py
slavanorm Mar 11, 2024
0793c40
Update v2.2.2.rst
slavanorm Mar 11, 2024
9647d1d
Merge branch 'pandas-dev:main' into main
slavanorm Mar 11, 2024
391ff1a
Update _normalize.py
slavanorm Mar 11, 2024
a3a22e8
Update test_normalize.py
slavanorm Mar 12, 2024
15901a9
Merge branch 'pandas-dev:main' into main
slavanorm Mar 12, 2024
abc9974
Update _normalize.py
slavanorm Mar 12, 2024
3769b1f
precommit fix
Mar 12, 2024
0b91678
precommit fix
Mar 12, 2024
dc0d6f8
test assertions
Mar 13, 2024
d7bd075
test assertions
Mar 13, 2024
03f1dc1
test assertion
Mar 13, 2024
6b5d760
fix test
Mar 13, 2024
9a72855
fix test
Mar 13, 2024
194ff85
Merge branch 'main' into main
slavanorm Mar 13, 2024
eb71e1b
Update v2.2.2.rst
slavanorm Mar 14, 2024
dfc6376
testcase
Mar 14, 2024
61e8dcc
Merge branch 'main' of https://www.github.com/slavanorm/pandas
Mar 14, 2024
aeae288
Merge branch 'pandas-dev:main' into main
slavanorm Mar 14, 2024
c9ef7de
missing key should be ignored + tests
Mar 19, 2024
add1882
Merge branch 'main' of https://www.github.com/slavanorm/pandas
Mar 19, 2024
c924ba3
merge
Mar 21, 2024
3fe7870
Merge branch 'pandas-dev-main'
Mar 21, 2024
68b2a15
fix test
Mar 25, 2024
f24dc1b
precommit
Mar 25, 2024
69c5243
Merge branch 'pandas-dev:main' into main
slavanorm Mar 25, 2024
296ccc5
fix tests
Mar 26, 2024
51179b1
Merge branch 'pandas-dev:main' into main
slavanorm Mar 26, 2024
1e8d43a
Update _normalize.py
slavanorm Mar 26, 2024
6849ffe
Merge branch 'main' of https://github.com/slavanorm/pandas
Mar 26, 2024
fde9565
fix tests
Mar 26, 2024
7e10e5a
precommit
Mar 26, 2024
2ef598d
precommit
Mar 27, 2024
ef857ba
Merge branch 'main' into main
slavanorm Mar 27, 2024
9caace9
merge
Mar 29, 2024
a73e49f
merge
Mar 29, 2024
0db9759
Merge branch 'pandas-dev:main' into main
slavanorm Apr 1, 2024
a27d7a9
Merge branch 'pandas-dev:main' into main
slavanorm Apr 3, 2024
51062d7
Merge branch 'main' into main
slavanorm Apr 8, 2024
750a270
Merge branch 'main' into main
slavanorm Apr 12, 2024
11138d8
Merge branch 'pandas-dev:main' into main
slavanorm Apr 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Bug fixes
- :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`)
- :meth:`DataFrame.__dataframe__` was showing non-null validity buffer (instead of ``None``) ``'string[pyarrow]'`` without missing values (:issue:`57761`)
- :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`)
- Fixed bug in :func:`pandas.io.json_normalize` raising with errors='ignore' while traversing empty list (:issue:`57810`)

.. ---------------------------------------------------------------------------
.. _whatsnew_222.other:
Expand Down
14 changes: 7 additions & 7 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,14 +432,14 @@ def _pull_field(
) -> Scalar | Iterable:
"""Internal function to pull field"""
result = js
if not isinstance(spec, list):
spec = [spec]
try:
if isinstance(spec, list):
for field in spec:
if result is None:
raise KeyError(field)
result = result[field]
else:
result = result[spec]
for field in spec:
# GH 57810
if result is None or not len(result):
raise KeyError(field)
result = result[field]
except KeyError as e:
if extract_record:
raise KeyError(
Expand Down
41 changes: 35 additions & 6 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ def missing_metadata():
],
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
},
{
"name": "Minnie",
"addresses": [
{
"number": 8449,
}
],
"previous_residences": {"cities": []},
},
]


Expand Down Expand Up @@ -631,14 +640,15 @@ def test_missing_meta(self, missing_metadata):
ex_data = [
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
[8449, np.nan, np.nan, np.nan, np.nan, "Minnie"],
]
columns = ["number", "street", "city", "state", "zip", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)

def test_missing_nested_meta(self):
def test_missing_nested_meta_traverse_none_errors_ignore(self):
# GH44312
# If errors="ignore" and nested metadata is null, we should return nan
# If errors="ignore" and nested metadata is nullable, return nan
data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
result = json_normalize(
data,
Expand All @@ -653,8 +663,11 @@ def test_missing_nested_meta(self):
)
tm.assert_frame_equal(result, expected)

# If errors="raise" and nested metadata is null, we should raise with the
# key of the first missing level
def test_missing_nested_meta_traverse_none_errors_raise(self):
# GH44312
# If errors="raise" and nested metadata is null, should raise
data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}

with pytest.raises(KeyError, match="'leaf' not found"):
json_normalize(
data,
Expand All @@ -663,6 +676,22 @@ def test_missing_nested_meta(self):
errors="raise",
)

def test_missing_nested_meta_traverse_empty_list_errors_ignore(self):
# If errors="ignore" and nested metadata is nullable, return nan
data = {"meta": "foo", "nested_meta": [], "value": [{"rec": 1}, {"rec": 2}]}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
data = {"meta": "foo", "nested_meta": [], "value": [{"rec": 1}, {"rec": 2}]}
data = {"meta": "foo", "nested_meta": {}, "value": [{"rec": 1}, {"rec": 2}]}

I have lost track a bit on what we are trying to accomplish but this doesn't seem right to ignore an error when the meta argument starts peering into in an array instead of an object. Is this a TypeError on main or a KeyError? I feel like we are mixing the two up when we should not be

result = json_normalize(
data,
record_path="value",
meta=["meta", ["nested_meta", "leaf"]],
errors="ignore",
)
ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]]
columns = ["rec", "meta", "nested_meta.leaf"]
expected = DataFrame(ex_data, columns=columns).astype(
{"nested_meta.leaf": object}
)
tm.assert_frame_equal(result, expected)

def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
# GH41876
# Ensure errors='raise' works as intended even when a record_path of length
Expand All @@ -681,8 +710,8 @@ def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata

def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
# GH41876
# Ensure errors='ignore' works as intended even when a record_path of length
# greater than one is passed in
# Ensure errors='ignore' works as intended
# even when a record_path of length greater than one is passed in
result = json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
Expand Down