From daf9d3e8d0e9baf9455b31d5a950146f0a63d094 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sat, 9 Mar 2019 11:47:57 -0500 Subject: [PATCH 01/10] BUG: fixed json_normalize with nullable meta fields (#25468) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/io/json/normalize.py | 2 +- pandas/tests/io/json/test_normalize.py | 43 ++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ea08a0a6fe07b..380f00903e0f4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,7 +214,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- +- Bug in :func:`json_normalize` for ``errors='ignore'`` and nullable metadata fields, the null values in dataframe were literal nan string and not numpy.nan (:issue:`25468`) - - diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 279630ccd107c..7c18f11eb13b0 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -281,6 +281,6 @@ def _recursive_extract(data, path, seen_meta, level=0): raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) - result[k] = np.array(v).repeat(lengths) + result[k] = np.array(v, dtype=object).repeat(lengths) return result diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3bf699cc8a1f0..8d7ba2f0cffaf 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -65,6 +65,24 @@ def author_missing_data(): {'first': 'Jane', 'last_name': 'Doe'} }] +@pytest.fixture +def address_missing_data(): + return [ + {'name': 'Alice', + 'addresses': [{'number': 9562, + 'street': 'Morris St.', + 'city': 'Massillon', + 'state': 'OH', + 'zip': 44646}] + }, + {'addresses': [{'number': 8449, + 'street': 'Spring St.', + 'city': 'Elizabethton', + 'state': 'TN', + 'zip': 37643}] + } + ] + class TestJSONNormalize(object): @@ -378,6 +396,31 @@ def test_json_normalize_errors(self): ['general', 'trade_version']], errors='raise') + def test_missing_meta(self, address_missing_data): + # GH25468: If metadata is nullable with errors set to ignore, the null + # values should be numpy.nan values + result = json_normalize( + data=address_missing_data, + record_path='addresses', + meta='name', + errors='ignore') + ex_data = [ + {'city': 'Massillon', + 'number': 9562, + 'state': 'OH', + 'street': 'Morris St.', + 'zip': 44646, + 'name': 'Alice'}, + {'city': 'Elizabethton', + 'number': 8449, + 'state': 'TN', + 'street': 'Spring St.', + 'zip': 37643, + 'name': np.nan} + ] + expected = DataFrame(ex_data, columns=ex_data[0].keys()) + tm.assert_frame_equal(result, expected) + def test_donot_drop_nonevalues(self): # GH21356 data = [ From 2cc267c26c0965406ce97444509d6f908f285877 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sat, 9 Mar 2019 11:55:50 -0500 Subject: [PATCH 02/10] fixed code style --- pandas/tests/io/json/test_normalize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 8d7ba2f0cffaf..547823c7122f2 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -65,6 +65,7 @@ def author_missing_data(): {'first': 'Jane', 'last_name': 'Doe'} }] + @pytest.fixture def address_missing_data(): return [ @@ -75,7 +76,7 @@ def address_missing_data(): 'state': 'OH', 'zip': 44646}] }, - {'addresses': [{'number': 8449, + {'addresses': [{'number': 8449, 'street': 'Spring St.', 'city': 'Elizabethton', 'state': 'TN', From a24a2131eb019388d16828cc4a593de055b1361a Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sat, 9 Mar 2019 12:31:00 -0500 Subject: [PATCH 03/10] modified test to ignore order of columns --- pandas/tests/io/json/test_normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 547823c7122f2..a4fcd8f9034b2 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -419,8 +419,8 @@ def test_missing_meta(self, address_missing_data): 'zip': 37643, 'name': np.nan} ] - expected = DataFrame(ex_data, columns=ex_data[0].keys()) - tm.assert_frame_equal(result, expected) + expected = DataFrame(ex_data) + tm.assert_frame_equal(result, expected, check_like=True) def test_donot_drop_nonevalues(self): # GH21356 From cf93f8ae43e8a5c615814d5fff858b346b8202f5 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sat, 9 Mar 2019 13:17:56 -0500 Subject: [PATCH 04/10] fixed change log addition --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 380f00903e0f4..b064fecdf9617 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,7 +214,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- Bug in :func:`json_normalize` for ``errors='ignore'`` and nullable metadata fields, the null values in dataframe were literal nan string and not numpy.nan (:issue:`25468`) +- Bug in :func:`json_normalize` for ``errors='ignore'`` and nullable metadata fields, the missing metadata values in dataframe were filled with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - - From 2a8a7260aa402d35699513d90d0aa3f0c584b5c6 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sun, 10 Mar 2019 13:23:02 -0400 Subject: [PATCH 05/10] added requested changes --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/io/json/test_normalize.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b064fecdf9617..367962a534b3d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,7 +214,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- Bug in :func:`json_normalize` for ``errors='ignore'`` and nullable metadata fields, the missing metadata values in dataframe were filled with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) +- Bug in :func:`json_normalize` for ``errors='ignore'`` and missing metadata fields, the missing metadata values in dataframe were filled with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - - diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a4fcd8f9034b2..1bf328b1fa5af 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -67,7 +67,7 @@ def author_missing_data(): @pytest.fixture -def address_missing_data(): +def missing_metadata(): return [ {'name': 'Alice', 'addresses': [{'number': 9562, @@ -397,11 +397,12 @@ def test_json_normalize_errors(self): ['general', 'trade_version']], errors='raise') - def test_missing_meta(self, address_missing_data): - # GH25468: If metadata is nullable with errors set to ignore, the null - # values should be numpy.nan values + def test_missing_meta(self, missing_metadata): + # GH25468 + # If metadata is nullable with errors set to ignore, the null values + # should be numpy.nan values result = json_normalize( - data=address_missing_data, + data=missing_metadata, record_path='addresses', meta='name', errors='ignore') @@ -419,8 +420,9 @@ def test_missing_meta(self, address_missing_data): 'zip': 37643, 'name': np.nan} ] + columns = ex_data[0].keys() expected = DataFrame(ex_data) - tm.assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result[columns], expected[columns]) def test_donot_drop_nonevalues(self): # GH21356 From dba832c5cef6db6e4d5b33632785dce96ee66a12 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sun, 10 Mar 2019 13:29:38 -0400 Subject: [PATCH 06/10] refactor test_json_normalize_errors --- pandas/tests/io/json/test_normalize.py | 64 ++++---------------------- 1 file changed, 9 insertions(+), 55 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 1bf328b1fa5af..18efbf4535439 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -337,64 +337,18 @@ def test_nested_flattens(self): assert result == expected - def test_json_normalize_errors(self): - # GH14583: If meta keys are not always present - # a new option to set errors='ignore' has been implemented - i = { - "Trades": [{ - "general": { - "tradeid": 100, - "trade_version": 1, - "stocks": [{ - - "symbol": "AAPL", - "name": "Apple", - "price": "0" - }, { - "symbol": "GOOG", - "name": "Google", - "price": "0" - } - ] - } - }, { - "general": { - "tradeid": 100, - "stocks": [{ - "symbol": "AAPL", - "name": "Apple", - "price": "0" - }, { - "symbol": "GOOG", - "name": "Google", - "price": "0" - } - ] - } - } - ] - } - j = json_normalize(data=i['Trades'], - record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], - ['general', 'trade_version']], - errors='ignore') - expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, - 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, - 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, - 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, - 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} - - assert j.fillna('').to_dict() == expected - - msg = ("Try running with errors='ignore' as key 'trade_version'" + def test_json_normalize_errors(self, missing_metadata): + # GH14583: + # If meta keys are not always present a new option to set + # errors='ignore' has been implemented + + msg = ("Try running with errors='ignore' as key 'name'" " is not always present") with pytest.raises(KeyError, match=msg): json_normalize( - data=i['Trades'], - record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], - ['general', 'trade_version']], + data=missing_metadata, + record_path='addresses', + meta='name', errors='raise') def test_missing_meta(self, missing_metadata): From 5b6a8e749ed1242fe931ddfffe547d3e227ce5c9 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sun, 10 Mar 2019 15:12:20 -0400 Subject: [PATCH 07/10] build expected from list of lists to make order deterministic --- pandas/tests/io/json/test_normalize.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 18efbf4535439..5362274274d72 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -374,9 +374,13 @@ def test_missing_meta(self, missing_metadata): 'zip': 37643, 'name': np.nan} ] - columns = ex_data[0].keys() - expected = DataFrame(ex_data) - tm.assert_frame_equal(result[columns], expected[columns]) + ex_data = [ + ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], + ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan] + ] + columns = ['city', 'number', 'state', 'street', 'zip', 'name'] + expected = DataFrame(ex_data, columns=columns) + tm.assert_frame_equal(result, expected) def test_donot_drop_nonevalues(self): # GH21356 From ba5902ef058a89259f8b1adef2667bb20ba84f35 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Sun, 10 Mar 2019 20:28:59 -0400 Subject: [PATCH 08/10] changed the entry for json_normalize --- doc/source/whatsnew/v0.25.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 367962a534b3d..3f74e3cfc1d9c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,8 +214,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- Bug in :func:`json_normalize` for ``errors='ignore'`` and missing metadata fields, the missing metadata values in dataframe were filled with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) -- +- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - From 49530126682272355176e818f8ea4e184f9c1485 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Wed, 13 Mar 2019 11:16:41 -0400 Subject: [PATCH 09/10] added one-liner --- pandas/io/json/normalize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 7c18f11eb13b0..7a8188dd07b6b 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0): raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) + # forcing dtype to object to avoid the metadata being casted to string result[k] = np.array(v, dtype=object).repeat(lengths) return result From b3b744cf23dd12b603cff2b3e57751bf7150cda9 Mon Sep 17 00:00:00 2001 From: antoineviscardi Date: Wed, 13 Mar 2019 11:18:51 -0400 Subject: [PATCH 10/10] resolved merging issue --- doc/source/whatsnew/v0.25.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 16804e8866bff..0b4b3519003aa 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,7 +214,6 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -<<<<<<< HEAD - Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) -