diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e74bd2f745b94..b92679b5e2f0d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -64,6 +64,7 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- :func:`pandas.json_normalize` now supports normalizing specified ``meta`` fields from an array of records when the ``record_path`` is ``None`` or an empty list (``[]``) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 45c8876dbe3e5..23cf38fa699d6 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -514,7 +514,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: ): return DataFrame(_simple_json_normalize(data, sep=sep), index=index) - if record_path is None: + if record_path is None and meta is None: if any([isinstance(x, dict) for x in y.values()] for y in data): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for @@ -525,6 +525,8 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: # reasonably data = nested_to_record(data, sep=sep, max_level=max_level) return DataFrame(data, index=index) + elif record_path is None and meta is not None: + record_path = [] elif not isinstance(record_path, list): record_path = [record_path] @@ -554,23 +556,30 @@ def _recursive_extract(data, path, seen_meta, level: int = 0) -> None: _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: - recs = _pull_records(obj, path[0]) - recs = [ - nested_to_record(r, sep=sep, max_level=max_level) - if isinstance(r, 
dict) - else r - for r in recs - ] - - # For repeating the metadata later - lengths.append(len(recs)) + if len(path) == 1: + recs = _pull_records(obj, path[0]) + recs = [ + nested_to_record(r, sep=sep, max_level=max_level) + if isinstance(r, dict) + else r + for r in recs + ] + records.extend(recs) + + # For repeating the metadata later + lengths.append(len(recs)) + else: + # If path is an empty list, data is treated as an + # array of records, and only the meta fields will + # be extracted from each record. + lengths.append(1) + for val, key in zip(_meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: meta_val = _pull_field(obj, val[level:]) meta_vals[key].append(meta_val) - records.extend(recs) _recursive_extract(data, record_path, {}, level=0) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index fdbfbd004617e..6a8070d27fb35 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -569,6 +569,29 @@ def test_series_index(self, state_data): result = json_normalize(series, "counties") tm.assert_index_equal(result.index, idx.repeat([3, 2])) + @pytest.mark.parametrize("path", [None, []]) + def test_empty_record_path_and_not_empty_meta(self, state_data, path): + ex_data = [ + { + "shortname": "FL", + "state": "Florida", + "info.governor": "Rick Scott", + }, + { + "shortname": "OH", + "state": "Ohio", + "info.governor": "John Kasich", + }, + ] + expected = DataFrame(ex_data) + + result = json_normalize( + state_data, + record_path=path, + meta=["shortname", "state", ["info", "governor"]], + ) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord: def test_flat_stays_flat(self):