From 855985d5c804eac365d4219a9d7c53e15ff3281a Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 8 Nov 2018 13:07:35 -0500 Subject: [PATCH 01/32] check for columns in dataframe --- pandas/core/reshape/melt.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 6596e055db1a8..06123906b25d0 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -32,7 +32,12 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: + # Check that `id_vars` are in frame id_vars = list(id_vars) + not_in_frame = [i not in frame.columns for i in id_vars] + if any(not_in_frame): + missing = ', '.join(not_in_frame) + raise ValueError(f'{missing} are not in dataframe') else: id_vars = [] @@ -45,6 +50,11 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, ' columns are a MultiIndex') else: value_vars = list(value_vars) + # Check that `value_vars` are in frame + not_in_frame = [i not in frame.columns for i in value_vars] + if any(not_in_frame): + missing = ', '.join(not_in_frame) + raise ValueError(f'{missing} are not in dataframe') frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() From 40fdb0590034de966485b95709c952f5ed3be88a Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 8 Nov 2018 16:10:51 -0500 Subject: [PATCH 02/32] check for columns in dataframe --- pandas/core/reshape/melt.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 06123906b25d0..1ed8395c3428c 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -34,10 +34,9 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, else: # Check that `id_vars` are in frame id_vars = list(id_vars) - not_in_frame = [i not in frame.columns for i in id_vars] - if 
any(not_in_frame): - missing = ', '.join(not_in_frame) - raise ValueError(f'{missing} are not in dataframe') + missing = [v for v in id_vars if v not in frame.columns] + if missing: + raise ValueError(f'Columns {missing} are not in dataframe') else: id_vars = [] @@ -51,10 +50,10 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, else: value_vars = list(value_vars) # Check that `value_vars` are in frame - not_in_frame = [i not in frame.columns for i in value_vars] - if any(not_in_frame): - missing = ', '.join(not_in_frame) - raise ValueError(f'{missing} are not in dataframe') + missing = [v for v in value_vars if v not in frame.columns] + if missing: + # missing_vars = str(missing) + raise ValueError(f'Columns {missing} are not in dataframe') frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() From 9670da27188b7f210efe04635c00194f81d63355 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 09:07:59 -0500 Subject: [PATCH 03/32] check difference with Index; use {} str formatting --- pandas/core/reshape/melt.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1ed8395c3428c..d1a1f6d57e6ac 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -12,6 +12,7 @@ from pandas import compat from pandas.core.arrays import Categorical +from pandas.core.indexes.base import Index from pandas.core.frame import _shared_docs from pandas.core.reshape.concat import concat from pandas.core.tools.numeric import to_numeric @@ -34,9 +35,9 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, else: # Check that `id_vars` are in frame id_vars = list(id_vars) - missing = [v for v in id_vars if v not in frame.columns] + missing = Index(id_vars).difference(frame.columns) if missing: - raise ValueError(f'Columns {missing} are not in dataframe') + raise ValueError('Columns {missing} are not in 
dataframe'.format(missing=missing)) else: id_vars = [] @@ -50,10 +51,10 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, else: value_vars = list(value_vars) # Check that `value_vars` are in frame - missing = [v for v in value_vars if v not in frame.columns] + missing = Index(value_vars).difference(frame.columns) if missing: # missing_vars = str(missing) - raise ValueError(f'Columns {missing} are not in dataframe') + raise ValueError('Columns {missing} are not in dataframe'.format(missing=missing)) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() From 3ffc8701ef6e63209927a14ec3b4cb29ff991485 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 09:11:52 -0500 Subject: [PATCH 04/32] missing.any() --- pandas/core/reshape/melt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index d1a1f6d57e6ac..1d75d0b31e9a3 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -36,7 +36,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(id_vars).difference(frame.columns) - if missing: + if missing.any(): raise ValueError('Columns {missing} are not in dataframe'.format(missing=missing)) else: id_vars = [] @@ -52,7 +52,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(value_vars).difference(frame.columns) - if missing: + if missing.any(): # missing_vars = str(missing) raise ValueError('Columns {missing} are not in dataframe'.format(missing=missing)) frame = frame.loc[:, id_vars + value_vars] From 8139f78c502e468bee6b5e93aa0f48ea69c079f4 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 09:26:02 -0500 Subject: [PATCH 05/32] started test --- pandas/tests/reshape/test_melt.py | 13 +++++++++++++ 1 file changed, 13 
insertions(+) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index e83a2cb483de7..9f9c1125e5c64 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -661,3 +661,16 @@ def test_col_substring_of_stubname(self): i=['node_id', 'A'], j='time') tm.assert_frame_equal(result, expected) + def test_melt_missing_columns(self): + # Addresses issue #23575 + # This test is to ensure that pandas raises an error if melting is + # attempted with column names absent from the dataframe + + # Generate data + people = ['Susie', 'Alejandro'] + day = ['Monday', 'Tuesday', 'Wednesday'] + data = [[person, d, *np.random.randint(0, 5, 2)] for person in people for d in day] + df = pd.DataFrame(data, columns=['Name', 'day', 'burgers', 'fries']) + + # Try to melt with missing column name + df.melt(['Name', 'day'], ['Burgers', 'fries']) \ No newline at end of file From 0a946502da1627cab8ef4d58a85bb03036d22706 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 09:42:20 -0500 Subject: [PATCH 06/32] added to whatsnew --- doc/source/whatsnew/v0.24.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5fefb9e3e405c..0234b379b809f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1344,6 +1344,7 @@ Reshaping - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have :class:`MultiIndex`ed columns (:issue:`23033`). +- Bug in :func: `pandas.melt` when passing column names that do no exist in dataframe (:issue:`23575`) .. 
_whatsnew_0240.bug_fixes.sparse: From d0f6d23ce6a7d7b9db72fd7f360a95bde639828e Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 09:46:47 -0500 Subject: [PATCH 07/32] PEP criteria --- pandas/core/reshape/melt.py | 6 ++++-- pandas/tests/reshape/test_melt.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1d75d0b31e9a3..9893f6853bffa 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -37,7 +37,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, id_vars = list(id_vars) missing = Index(id_vars).difference(frame.columns) if missing.any(): - raise ValueError('Columns {missing} are not in dataframe'.format(missing=missing)) + raise ValueError('Columns {missing} are not in' + ' dataframe'.format(missing=missing)) else: id_vars = [] @@ -54,7 +55,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, missing = Index(value_vars).difference(frame.columns) if missing.any(): # missing_vars = str(missing) - raise ValueError('Columns {missing} are not in dataframe'.format(missing=missing)) + raise ValueError('Columns {missing} are not in' + ' dataframe'.format(missing=missing)) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 9f9c1125e5c64..1402cef663973 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -661,6 +661,7 @@ def test_col_substring_of_stubname(self): i=['node_id', 'A'], j='time') tm.assert_frame_equal(result, expected) + def test_melt_missing_columns(self): # Addresses issue #23575 # This test is to ensure that pandas raises an error if melting is @@ -669,8 +670,9 @@ def test_melt_missing_columns(self): # Generate data people = ['Susie', 'Alejandro'] day = ['Monday', 'Tuesday', 'Wednesday'] - data = [[person, d, *np.random.randint(0, 5, 2)] for person in 
people for d in day] + data = [[person, d, *np.random.randint(0, 5, 2)] for person in + people for d in day] df = pd.DataFrame(data, columns=['Name', 'day', 'burgers', 'fries']) # Try to melt with missing column name - df.melt(['Name', 'day'], ['Burgers', 'fries']) \ No newline at end of file + df.melt(['Name', 'day'], ['Burgers', 'fries']) From 6c76161a29e74a5333830cd38dc53d8bcbadca99 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 11:05:03 -0500 Subject: [PATCH 08/32] `missing.empty` to accommodate MultiIndex --- pandas/core/reshape/melt.py | 4 ++-- pandas/tests/reshape/test_melt.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 9893f6853bffa..f8fd1c8d66ae4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -36,7 +36,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(id_vars).difference(frame.columns) - if missing.any(): + if not missing.empty: raise ValueError('Columns {missing} are not in' ' dataframe'.format(missing=missing)) else: @@ -53,7 +53,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(value_vars).difference(frame.columns) - if missing.any(): + if not missing.empty: # missing_vars = str(missing) raise ValueError('Columns {missing} are not in' ' dataframe'.format(missing=missing)) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 1402cef663973..eb8051fbba19e 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -675,4 +675,5 @@ def test_melt_missing_columns(self): df = pd.DataFrame(data, columns=['Name', 'day', 'burgers', 'fries']) # Try to melt with missing column name - df.melt(['Name', 'day'], ['Burgers', 'fries']) + with pytest.raises(ValueError): 
+ df.melt(['Name', 'day'], ['Burgers', 'fries']) From ad3d9260cd6dc1ecc38a87aca9f2e00533d3643c Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 11:48:30 -0500 Subject: [PATCH 09/32] rm `*` --- pandas/tests/reshape/test_melt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index eb8051fbba19e..8be0197542346 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -670,8 +670,9 @@ def test_melt_missing_columns(self): # Generate data people = ['Susie', 'Alejandro'] day = ['Monday', 'Tuesday', 'Wednesday'] - data = [[person, d, *np.random.randint(0, 5, 2)] for person in - people for d in day] + cols = ['burgers', 'fries'] + data = [[person, d] + list(np.random.randint(0, 5, len(cols))) + for person in people for d in day] df = pd.DataFrame(data, columns=['Name', 'day', 'burgers', 'fries']) # Try to melt with missing column name From e097a875a6bc4b37ef9a07558ee7a8da64e2e49f Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 11:48:43 -0500 Subject: [PATCH 10/32] rm comment --- pandas/core/reshape/melt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index f8fd1c8d66ae4..1ca2cdc7d6254 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -54,7 +54,6 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, # Check that `value_vars` are in frame missing = Index(value_vars).difference(frame.columns) if not missing.empty: - # missing_vars = str(missing) raise ValueError('Columns {missing} are not in' ' dataframe'.format(missing=missing)) frame = frame.loc[:, id_vars + value_vars] From 5ff3a32823181df2828e8677ccfc6a44f84ac92c Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 12:50:40 -0500 Subject: [PATCH 11/32] add test for id_var and multiple missing --- pandas/tests/reshape/test_melt.py 
| 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 8be0197542346..036d6afaf08e4 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -675,6 +675,26 @@ def test_melt_missing_columns(self): for person in people for d in day] df = pd.DataFrame(data, columns=['Name', 'day', 'burgers', 'fries']) - # Try to melt with missing column name - with pytest.raises(ValueError): + # Try to melt with missing `value_vars` column name + with pytest.raises(KeyError, match="The following 'value_vars' are not" + " present in" + " the DataFrame: 'Burgers'"): df.melt(['Name', 'day'], ['Burgers', 'fries']) + + # Try to melt with missing `id_vars` column name + with pytest.raises(KeyError, match="The following 'id_vars' are not" + " present in" + " the DataFrame: 'Day'"): + df.melt(['Name', 'Day'], ['Burgers', 'fries']) + + # Try with error in both-> `id_vars` caught first + with pytest.raises(KeyError, match="The following 'id_vars' are not" + " present in" + " the DataFrame: 'not_here'"): + df.melt(['Name', 'day', 'not_here'], ['Burgers', 'fries']) + + # Multiple missing + with pytest.raises(KeyError, match="The following 'id_vars' are not" + " present in" + " the DataFrame: 'not_here, or_there'"): + df.melt(['Name', 'day', 'not_here', 'or_there'], ['Burgers', 'fries']) \ No newline at end of file From fcbda155cbdfc6d7413141e88705949bdc8c0c86 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 12:51:06 -0500 Subject: [PATCH 12/32] reformat error statement; Value->KeyError --- pandas/core/reshape/melt.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1ca2cdc7d6254..ce2f086d77d37 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -37,8 +37,9 @@ def melt(frame, id_vars=None, value_vars=None, 
var_name=None, id_vars = list(id_vars) missing = Index(id_vars).difference(frame.columns) if not missing.empty: - raise ValueError('Columns {missing} are not in' - ' dataframe'.format(missing=missing)) + raise KeyError("The following 'id_vars' are not present" + " in the DataFrame:" + " '{missing}'".format(missing=', '.join(missing))) else: id_vars = [] @@ -54,8 +55,9 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, # Check that `value_vars` are in frame missing = Index(value_vars).difference(frame.columns) if not missing.empty: - raise ValueError('Columns {missing} are not in' - ' dataframe'.format(missing=missing)) + raise KeyError("The following 'value_vars' are not present in" + " the DataFrame:" + " '{missing}'".format(missing=', '.join(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() From 3175b3400efb0e020d5f53a9958b6ae9ed26f079 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 12:54:15 -0500 Subject: [PATCH 13/32] simplified test --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/tests/reshape/test_melt.py | 19 +++++++------------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 0234b379b809f..9650ea44bf5de 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1344,7 +1344,7 @@ Reshaping - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have :class:`MultiIndex`ed columns (:issue:`23033`). 
-- Bug in :func: `pandas.melt` when passing column names that do no exist in dataframe (:issue:`23575`) +- Bug in :func: `pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 036d6afaf08e4..d7b26bb8c40c8 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -668,33 +668,28 @@ def test_melt_missing_columns(self): # attempted with column names absent from the dataframe # Generate data - people = ['Susie', 'Alejandro'] - day = ['Monday', 'Tuesday', 'Wednesday'] - cols = ['burgers', 'fries'] - data = [[person, d] + list(np.random.randint(0, 5, len(cols))) - for person in people for d in day] - df = pd.DataFrame(data, columns=['Name', 'day', 'burgers', 'fries']) + df = pd.DataFrame(np.random.randn(5,4), columns=list('abcd')) # Try to melt with missing `value_vars` column name with pytest.raises(KeyError, match="The following 'value_vars' are not" " present in" - " the DataFrame: 'Burgers'"): - df.melt(['Name', 'day'], ['Burgers', 'fries']) + " the DataFrame: 'C'"): + df.melt(['a', 'b'], ['C', 'd']) # Try to melt with missing `id_vars` column name with pytest.raises(KeyError, match="The following 'id_vars' are not" " present in" - " the DataFrame: 'Day'"): - df.melt(['Name', 'Day'], ['Burgers', 'fries']) + " the DataFrame: 'A'"): + df.melt(['A', 'b'], ['c', 'd']) # Try with error in both-> `id_vars` caught first with pytest.raises(KeyError, match="The following 'id_vars' are not" " present in" " the DataFrame: 'not_here'"): - df.melt(['Name', 'day', 'not_here'], ['Burgers', 'fries']) + df.melt(['a', 'b', 'not_here'], ['c', 'd']) # Multiple missing with pytest.raises(KeyError, match="The following 'id_vars' are not" " present in" " the DataFrame: 'not_here, or_there'"): - df.melt(['Name', 'day', 'not_here', 'or_there'], ['Burgers', 'fries']) \ No newline at end of file + 
df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) \ No newline at end of file From 515fb9f2866d7fa89629e2e1b544894f1d894ff9 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 13:23:26 -0500 Subject: [PATCH 14/32] Issue -> GH --- pandas/tests/reshape/test_melt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index d7b26bb8c40c8..3356b46bf1621 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -663,7 +663,7 @@ def test_col_substring_of_stubname(self): tm.assert_frame_equal(result, expected) def test_melt_missing_columns(self): - # Addresses issue #23575 + # GH-23575 # This test is to ensure that pandas raises an error if melting is # attempted with column names absent from the dataframe From c7d6fcfc49746272c3ba086f65a9161ce1fcbad1 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 13:31:04 -0500 Subject: [PATCH 15/32] PEP criteria --- pandas/core/reshape/melt.py | 6 ++++-- pandas/tests/reshape/test_melt.py | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index ce2f086d77d37..c58e66fcb5efa 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -39,7 +39,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if not missing.empty: raise KeyError("The following 'id_vars' are not present" " in the DataFrame:" - " '{missing}'".format(missing=', '.join(missing))) + " '{missing}'".format( + missing=', '.join(missing))) else: id_vars = [] @@ -57,7 +58,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if not missing.empty: raise KeyError("The following 'value_vars' are not present in" " the DataFrame:" - " '{missing}'".format(missing=', '.join(missing))) + " '{missing}'".format( + missing=', '.join(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = 
frame.copy() diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 3356b46bf1621..22db298102eda 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -668,7 +668,7 @@ def test_melt_missing_columns(self): # attempted with column names absent from the dataframe # Generate data - df = pd.DataFrame(np.random.randn(5,4), columns=list('abcd')) + df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) # Try to melt with missing `value_vars` column name with pytest.raises(KeyError, match="The following 'value_vars' are not" @@ -691,5 +691,6 @@ def test_melt_missing_columns(self): # Multiple missing with pytest.raises(KeyError, match="The following 'id_vars' are not" " present in" - " the DataFrame: 'not_here, or_there'"): - df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) \ No newline at end of file + " the DataFrame: " + "'not_here, or_there'"): + df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) From 5911cc348c3d5da2b3b91c45d7304156b27fd6f3 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 13:32:07 -0500 Subject: [PATCH 16/32] PEP criteria --- pandas/core/reshape/melt.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index c58e66fcb5efa..45b784c78123d 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -39,8 +39,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if not missing.empty: raise KeyError("The following 'id_vars' are not present" " in the DataFrame:" - " '{missing}'".format( - missing=', '.join(missing))) + " '{missing}'" + "".format(missing=', '.join(missing))) else: id_vars = [] @@ -58,8 +58,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if not missing.empty: raise KeyError("The following 'value_vars' are not present in" " the DataFrame:" - " '{missing}'".format( - missing=', '.join(missing))) + " 
'{missing}'" + "".format(missing=', '.join(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() From 47ca7fccadacd4068edac937bafc9d392fdcff6d Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Tue, 13 Nov 2018 17:25:07 -0500 Subject: [PATCH 17/32] test not working now --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/reshape/melt.py | 10 +++--- pandas/tests/reshape/test_melt.py | 57 +++++++++++++------------------ 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9650ea44bf5de..66a8361974829 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1344,7 +1344,7 @@ Reshaping - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have :class:`MultiIndex`ed columns (:issue:`23033`). -- Bug in :func: `pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) +- Bug in :func:`pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) .. 
_whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 45b784c78123d..60496ba64da5a 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -38,9 +38,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, missing = Index(id_vars).difference(frame.columns) if not missing.empty: raise KeyError("The following 'id_vars' are not present" - " in the DataFrame:" - " '{missing}'" - "".format(missing=', '.join(missing))) + " in the DataFrame: {missing}" + "".format(missing=list(missing))) else: id_vars = [] @@ -57,9 +56,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, missing = Index(value_vars).difference(frame.columns) if not missing.empty: raise KeyError("The following 'value_vars' are not present in" - " the DataFrame:" - " '{missing}'" - "".format(missing=', '.join(missing))) + " the DataFrame: {missing}" + "".format(missing=list(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 22db298102eda..e0cbbb5302ef8 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -305,6 +305,30 @@ def test_pairs(self): 'wt': ['wt%d' % i for i in range(1, 4)]} pytest.raises(ValueError, lreshape, df, spec) + def test_melt_missing_columns_raises(self): + # GH-23575 + # This test is to ensure that pandas raises an error if melting is + # attempted with column names absent from the dataframe + + # Generate data + df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) + + # Try to melt with missing `value_vars` column name + msg = "The following '{Var}' are not present in the DataFrame: {Col}" + with pytest.raises(KeyError, + match=msg.format(Var='value_vars', Col=['C'])): + df.melt(['a', 'b'], ['C', 'd']) + + # Try to melt with missing `id_vars` column name + with pytest.raises(KeyError, + match=msg.format(Var='id_vars', 
Col=['A'])): + df.melt(['A', 'b'], ['c', 'd']) + + # Multiple missing + with pytest.raises(KeyError, + match=msg.format(Var='id_vars', Col=['not_here', 'or_there'])): + df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) + class TestWideToLong(object): @@ -661,36 +685,3 @@ def test_col_substring_of_stubname(self): i=['node_id', 'A'], j='time') tm.assert_frame_equal(result, expected) - - def test_melt_missing_columns(self): - # GH-23575 - # This test is to ensure that pandas raises an error if melting is - # attempted with column names absent from the dataframe - - # Generate data - df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) - - # Try to melt with missing `value_vars` column name - with pytest.raises(KeyError, match="The following 'value_vars' are not" - " present in" - " the DataFrame: 'C'"): - df.melt(['a', 'b'], ['C', 'd']) - - # Try to melt with missing `id_vars` column name - with pytest.raises(KeyError, match="The following 'id_vars' are not" - " present in" - " the DataFrame: 'A'"): - df.melt(['A', 'b'], ['c', 'd']) - - # Try with error in both-> `id_vars` caught first - with pytest.raises(KeyError, match="The following 'id_vars' are not" - " present in" - " the DataFrame: 'not_here'"): - df.melt(['a', 'b', 'not_here'], ['c', 'd']) - - # Multiple missing - with pytest.raises(KeyError, match="The following 'id_vars' are not" - " present in" - " the DataFrame: " - "'not_here, or_there'"): - df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) From d0ee9c5ad85d2beb364dae3a6ea74c588bea36f5 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 14 Nov 2018 09:05:14 -0500 Subject: [PATCH 18/32] regex compatible match --- pandas/tests/reshape/test_melt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index e0cbbb5302ef8..9eccac4c12b54 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -316,17 +316,17 
@@ def test_melt_missing_columns_raises(self): # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises(KeyError, - match=msg.format(Var='value_vars', Col=['C'])): + match=msg.format(Var='value_vars', Col="\\['C'\\]")): df.melt(['a', 'b'], ['C', 'd']) # Try to melt with missing `id_vars` column name with pytest.raises(KeyError, - match=msg.format(Var='id_vars', Col=['A'])): + match=msg.format(Var='id_vars', Col="\\['A'\\]")): df.melt(['A', 'b'], ['c', 'd']) # Multiple missing with pytest.raises(KeyError, - match=msg.format(Var='id_vars', Col=['not_here', 'or_there'])): + match=msg.format(Var='id_vars', Col="\\['not_here', 'or_there'\\]")): df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) From c75ab23bdf237a9966a28f0b6941dea3ea5e0398 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 14 Nov 2018 09:10:20 -0500 Subject: [PATCH 19/32] PEP criteria --- pandas/tests/reshape/test_melt.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 9eccac4c12b54..f7455ea6e4b69 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -316,17 +316,20 @@ def test_melt_missing_columns_raises(self): # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises(KeyError, - match=msg.format(Var='value_vars', Col="\\['C'\\]")): + match=msg.format(Var='value_vars', + Col="\\['C'\\]")): df.melt(['a', 'b'], ['C', 'd']) # Try to melt with missing `id_vars` column name with pytest.raises(KeyError, - match=msg.format(Var='id_vars', Col="\\['A'\\]")): + match=msg.format(Var='id_vars', + Col="\\['A'\\]")): df.melt(['A', 'b'], ['c', 'd']) # Multiple missing with pytest.raises(KeyError, - match=msg.format(Var='id_vars', Col="\\['not_here', 'or_there'\\]")): + match=msg.format(Var='id_vars', 
+ Col="\\['not_here', 'or_there'\\]")): df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) From 32ed22ce375c369b88c7f0dfd68fd3342e4ebcc8 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 14 Nov 2018 13:12:38 -0500 Subject: [PATCH 20/32] move test to TestMelt() class --- pandas/tests/reshape/test_melt.py | 54 +++++++++++++++---------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index f7455ea6e4b69..d33bcde767edf 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -233,6 +233,32 @@ def test_pandas_dtypes(self, col): expected.columns = ['klass', 'col', 'attribute', 'value'] tm.assert_frame_equal(result, expected) + def test_melt_missing_columns_raises(self): + # GH-23575 + # This test is to ensure that pandas raises an error if melting is + # attempted with column names absent from the dataframe + + # Generate data + df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) + + # Try to melt with missing `value_vars` column name + msg = "The following '{Var}' are not present in the DataFrame: {Col}" + with pytest.raises(KeyError, + match=msg.format(Var='value_vars', + Col="\\['C'\\]")): + df.melt(['a', 'b'], ['C', 'd']) + + # Try to melt with missing `id_vars` column name + with pytest.raises(KeyError, + match=msg.format(Var='id_vars', + Col="\\['A'\\]")): + df.melt(['A', 'b'], ['c', 'd']) + + # Multiple missing + with pytest.raises(KeyError, + match=msg.format(Var='id_vars', + Col="\\['not_here', 'or_there'\\]")): + df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) class TestLreshape(object): @@ -305,34 +331,6 @@ def test_pairs(self): 'wt': ['wt%d' % i for i in range(1, 4)]} pytest.raises(ValueError, lreshape, df, spec) - def test_melt_missing_columns_raises(self): - # GH-23575 - # This test is to ensure that pandas raises an error if melting is - # attempted with column names absent from the dataframe - - # 
Generate data - df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) - - # Try to melt with missing `value_vars` column name - msg = "The following '{Var}' are not present in the DataFrame: {Col}" - with pytest.raises(KeyError, - match=msg.format(Var='value_vars', - Col="\\['C'\\]")): - df.melt(['a', 'b'], ['C', 'd']) - - # Try to melt with missing `id_vars` column name - with pytest.raises(KeyError, - match=msg.format(Var='id_vars', - Col="\\['A'\\]")): - df.melt(['A', 'b'], ['c', 'd']) - - # Multiple missing - with pytest.raises(KeyError, - match=msg.format(Var='id_vars', - Col="\\['not_here', 'or_there'\\]")): - df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) - - class TestWideToLong(object): def test_simple(self): From e629b2a8b9cb090b9371a18167aa20f71088fc52 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 14 Nov 2018 13:28:40 -0500 Subject: [PATCH 21/32] PEP --- pandas/tests/reshape/test_melt.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index d33bcde767edf..965ce713f18be 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -244,20 +244,17 @@ def test_melt_missing_columns_raises(self): # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises(KeyError, - match=msg.format(Var='value_vars', - Col="\\['C'\\]")): + match=msg.format(Var='value_vars', Col="\\['C'\\]")): df.melt(['a', 'b'], ['C', 'd']) # Try to melt with missing `id_vars` column name with pytest.raises(KeyError, - match=msg.format(Var='id_vars', - Col="\\['A'\\]")): + match=msg.format(Var='id_vars', Col="\\['A'\\]")): df.melt(['A', 'b'], ['c', 'd']) # Multiple missing with pytest.raises(KeyError, - match=msg.format(Var='id_vars', - Col="\\['not_here', 'or_there'\\]")): + match=msg.format(Var='id_vars', Col="\\['not_here', 'or_there'\\]")): 
df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) class TestLreshape(object): From 89de406958ea4b8b00fb111e764433f63fbf6692 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 14 Nov 2018 13:31:53 -0500 Subject: [PATCH 22/32] PEP --- pandas/tests/reshape/test_melt.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 965ce713f18be..b32be3b2db5a1 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -244,17 +244,20 @@ def test_melt_missing_columns_raises(self): # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises(KeyError, - match=msg.format(Var='value_vars', Col="\\['C'\\]")): + match=msg.format(Var='value_vars', + Col="\\['C'\\]")): df.melt(['a', 'b'], ['C', 'd']) # Try to melt with missing `id_vars` column name with pytest.raises(KeyError, - match=msg.format(Var='id_vars', Col="\\['A'\\]")): + match=msg.format(Var='id_vars', + Col="\\['A'\\]")): df.melt(['A', 'b'], ['c', 'd']) # Multiple missing with pytest.raises(KeyError, - match=msg.format(Var='id_vars', Col="\\['not_here', 'or_there'\\]")): + match=msg.format(Var='id_vars', + Col="\\['not_here', 'or_there'\\]")): df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) class TestLreshape(object): From 1d13f4a6ec7140fa22509c29e10191809ab62b71 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 14 Nov 2018 13:34:03 -0500 Subject: [PATCH 23/32] PEP --- pandas/tests/reshape/test_melt.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index b32be3b2db5a1..f7818ca585fd0 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -243,21 +243,22 @@ def test_melt_missing_columns_raises(self): # Try to melt with missing `value_vars` 
column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" - with pytest.raises(KeyError, - match=msg.format(Var='value_vars', - Col="\\['C'\\]")): + with pytest.raises( + KeyError, + match=msg.format(Var='value_vars', Col="\\['C'\\]")): df.melt(['a', 'b'], ['C', 'd']) # Try to melt with missing `id_vars` column name - with pytest.raises(KeyError, - match=msg.format(Var='id_vars', - Col="\\['A'\\]")): + with pytest.raises( + KeyError, + match=msg.format(Var='id_vars', Col="\\['A'\\]")): df.melt(['A', 'b'], ['c', 'd']) # Multiple missing - with pytest.raises(KeyError, - match=msg.format(Var='id_vars', - Col="\\['not_here', 'or_there'\\]")): + with pytest.raises( + KeyError, + match=msg.format(Var='id_vars', + Col="\\['not_here', 'or_there'\\]")): df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) class TestLreshape(object): From 01e8d746497b00e5d7485df2439f0ed83a2b2818 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 15 Nov 2018 13:03:03 -0500 Subject: [PATCH 24/32] resolving conflicts --- doc/source/contributing.rst | 19 +- doc/source/whatsnew/v0.24.0.rst | 51 +- doc/sphinxext/contributors.py | 31 +- pandas/_libs/lib.pyx | 23 +- pandas/core/frame.py | 68 +- pandas/core/indexes/base.py | 106 ++- pandas/core/panel.py | 49 +- pandas/core/reshape/merge.py | 89 +++ pandas/core/series.py | 10 - pandas/io/formats/format.py | 11 +- pandas/io/formats/html.py | 35 +- pandas/tests/frame/test_combine_concat.py | 21 +- pandas/tests/indexes/test_base.py | 7 + .../formats/data/gh15019_expected_output.html | 30 + .../formats/data/gh22783_expected_output.html | 27 + pandas/tests/io/formats/test_format.py | 23 +- pandas/tests/io/formats/test_to_html.py | 45 ++ pandas/tests/io/parser/python_parser_only.py | 5 + pandas/tests/reshape/merge/test_merge.py | 520 +------------- pandas/tests/reshape/merge/test_multi.py | 672 ++++++++++++++++++ pandas/tests/reshape/test_concat.py | 15 + pandas/tests/series/test_operators.py | 28 +- 
pandas/tests/test_panel.py | 23 +- scripts/tests/test_validate_docstrings.py | 5 +- scripts/validate_docstrings.py | 8 +- 25 files changed, 1221 insertions(+), 700 deletions(-) create mode 100644 pandas/tests/io/formats/data/gh15019_expected_output.html create mode 100644 pandas/tests/io/formats/data/gh22783_expected_output.html create mode 100644 pandas/tests/reshape/merge/test_multi.py diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 7eb9a6cf815ba..b44bd1cfd9007 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -591,21 +591,14 @@ run this slightly modified command:: git diff master --name-only -- "*.py" | grep "pandas/" | xargs flake8 -Note that on Windows, these commands are unfortunately not possible because -commands like ``grep`` and ``xargs`` are not available natively. To imitate the -behavior with the commands above, you should run:: +Windows does not support the ``grep`` and ``xargs`` commands (unless installed +for example via the `MinGW `__ toolchain), but one can +imitate the behaviour as follows:: - git diff master --name-only -- "*.py" + for /f %i in ('git diff upstream/master --name-only ^| findstr pandas/') do flake8 %i -This will list all of the Python files that have been modified. The only ones -that matter during linting are any whose directory filepath begins with "pandas." -For each filepath, copy and paste it after the ``flake8`` command as shown below: - - flake8 - -Alternatively, you can install the ``grep`` and ``xargs`` commands via the -`MinGW `__ toolchain, and it will allow you to run the -commands above. +This will also get all the files being changed by the PR (and within the +``pandas/`` folder), and run ``flake8`` on them one after the other. .. 
_contributing.import-formatting: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 76e92978d0346..28e6f1c2c3573 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -24,7 +24,8 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing +the user to specify which decimal separator should be used in the output. (:issue:`23614`) .. _whatsnew_0240.enhancements.extension_array_operators: @@ -183,6 +184,47 @@ array, but rather an ``ExtensionArray``: This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0240.api_breaking.interval_values` for more. +.. _whatsnew_0240.enhancements.join_with_two_multiindexes: + +Joining with two multi-indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`Datafame.merge` and :func:`Dataframe.join` can now be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`6360`) + +See the :ref:`Merge, join, and concatenate +` documentation section. + +.. ipython:: python + + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index_left) + + + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + + left.join(right) + +For earlier versions this can be done using the following. + +.. 
ipython:: python + + pd.merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key', 'X', 'Y']) + .. _whatsnew_0240.enhancements.rename_axis: Renaming names in a MultiIndex @@ -961,6 +1003,7 @@ Other API Changes - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`) - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`). +- The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`) .. _whatsnew_0240.deprecations: @@ -981,6 +1024,7 @@ Deprecations - The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`) - The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) +- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) - :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. 
Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) @@ -1319,7 +1363,9 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) - Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`to_html()` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`) +- Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) @@ -1373,6 +1419,7 @@ Reshaping - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have a :class:`MultiIndex` for columns (:issue:`23033`). - Bug in :func:`pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) +- Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 0f04d47435699..8c9fa5bc961d1 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -10,6 +10,7 @@ """ from docutils import nodes from docutils.parsers.rst import Directive +import git from announce import build_components @@ -19,17 +20,25 @@ class ContributorsDirective(Directive): name = 'contributors' def run(self): - components = build_components(self.arguments[0]) - - message = nodes.paragraph() - message += nodes.Text(components['author_message']) - - listnode = nodes.bullet_list() - - for author in components['authors']: - para = nodes.paragraph() - para += nodes.Text(author) - listnode += nodes.list_item('', para) + range_ = self.arguments[0] + try: + components = build_components(range_) + except git.GitCommandError: + return [ + self.state.document.reporter.warning( + "Cannot find contributors for range '{}'".format(range_), + line=self.lineno) + ] + else: + message = nodes.paragraph() + message += nodes.Text(components['author_message']) + + listnode = nodes.bullet_list() + + for author in 
components['authors']: + para = nodes.paragraph() + para += nodes.Text(author) + listnode += nodes.list_item('', para) return [message, listnode] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cfc60256e97a3..0088a698f49e0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -48,8 +48,7 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 cimport util -from util cimport (is_nan, - UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN) +from util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN from tslib import array_to_datetime from tslibs.nattype cimport NPY_NAT @@ -1642,20 +1641,22 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: if n == 0: return False - + # Get a reference timezone to compare with the rest of the tzs in the array for i in range(n): base_val = values[i] if base_val is not NaT: base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) - - for j in range(i, n): - val = values[j] - if val is not NaT: - tz = getattr(val, 'tzinfo', None) - if not tz_compare(base_tz, tz): - return False break + for j in range(i, n): + # Compare val's timezone with the reference timezone + # NaT can coexist with tz-aware datetimes, so skip if encountered + val = values[j] + if val is not NaT: + tz = getattr(val, 'tzinfo', None) + if not tz_compare(base_tz, tz): + return False + return True @@ -2045,7 +2046,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: - if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: + if is_datetime_with_singletz_array(objects): from pandas import DatetimeIndex return DatetimeIndex(objects) seen.object_ = 1 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 511604517a84e..e313e0f37a445 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2035,24 +2035,21 @@ def to_parquet(self, fname, engine='auto', 
compression='snappy', def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, - line_width=None, max_rows=None, max_cols=None, - show_dimensions=False): + max_rows=None, max_cols=None, show_dimensions=False, + decimal='.', line_width=None): """ Render a DataFrame to a console-friendly tabular output. - %(shared_params)s line_width : int, optional Width to wrap a line in characters. - %(returns)s - See Also -------- to_html : Convert DataFrame to HTML. Examples -------- - >>> d = {'col1' : [1, 2, 3], 'col2' : [4, 5, 6]} + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} >>> df = pd.DataFrame(d) >>> print(df.to_string()) col1 col2 @@ -2068,42 +2065,37 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, sparsify=sparsify, justify=justify, index_names=index_names, header=header, index=index, - line_width=line_width, max_rows=max_rows, max_cols=max_cols, - show_dimensions=show_dimensions) + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width) formatter.to_string() if buf is None: result = formatter.buf.getvalue() return result - @Substitution(header='whether to print column labels, default True') + @Substitution(header='Whether to print column labels, default True') @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, bold_rows=True, - classes=None, escape=True, max_rows=None, max_cols=None, - show_dimensions=False, notebook=False, decimal='.', - border=None, table_id=None): + sparsify=None, index_names=True, justify=None, max_rows=None, + max_cols=None, show_dimensions=False, decimal='.', + bold_rows=True, classes=None, escape=True, + notebook=False, border=None, table_id=None): """ Render a 
DataFrame as an HTML table. - %(shared_params)s - bold_rows : boolean, default True - Make the row labels bold in the output + bold_rows : bool, default True + Make the row labels bold in the output. classes : str or list or tuple, default None - CSS class(es) to apply to the resulting html table - escape : boolean, default True + CSS class(es) to apply to the resulting html table. + escape : bool, default True Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. - decimal : string, default '.' - Character recognized as decimal separator, e.g. ',' in Europe - - .. versionadded:: 0.18.0 - border : int A ``border=border`` attribute is included in the opening `` tag. Default ``pd.options.html.border``. @@ -2114,9 +2106,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, A css id is included in the opening `
` tag if specified. .. versionadded:: 0.23.0 - %(returns)s - See Also -------- to_string : Convert DataFrame to a string. @@ -5213,8 +5203,10 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) + @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', + mapping={False: 'ignore', True: 'raise'}) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ Modify in place using non-NA values from another DataFrame. @@ -5238,17 +5230,28 @@ def update(self, other, join='left', overwrite=True, filter_func=None, * False: only update values that are NA in the original DataFrame. - filter_func : callable(1d-array) -> boolean 1d-array, optional + filter_func : callable(1d-array) -> bool 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. - raise_conflict : bool, default False - If True, will raise a ValueError if the DataFrame and `other` + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. + .. versionchanged :: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. + + Returns + ------- + None : method directly changes calling object + Raises ------ ValueError - When `raise_conflict` is True and there's overlapping non-NA data. + * When `errors='raise'` and there's overlapping non-NA data. 
+ * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` See Also -------- @@ -5319,6 +5322,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # TODO: Support other joins if join != 'left': # pragma: no cover raise NotImplementedError("Only left join is supported") + if errors not in ['ignore', 'raise']: + raise ValueError("The parameter errors must be either " + "'ignore' or 'raise'") if not isinstance(other, DataFrame): other = DataFrame(other) @@ -5332,7 +5338,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, with np.errstate(all='ignore'): mask = ~filter_func(this) | isna(that) else: - if raise_conflict: + if errors == 'raise': mask_this = notna(that) mask_that = notna(this) if any(mask_this & mask_that): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f449b4b33d8d..0632198c77262 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -19,7 +19,7 @@ ABCSeries, ABCDataFrame, ABCMultiIndex, ABCPeriodIndex, ABCTimedeltaIndex, ABCDatetimeIndex, - ABCDateOffset) + ABCDateOffset, ABCIndexClass) from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( @@ -522,6 +522,12 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = cls(values, name=name, dtype=dtype, **kwargs)._ndarray_values + if isinstance(values, (ABCSeries, ABCIndexClass)): + # Index._data must always be an ndarray. + # This is no-copy for when _values is an ndarray, + # which should be always at this point. 
+ values = np.asarray(values._values) + result = object.__new__(cls) result._data = values result.name = name @@ -3162,8 +3168,8 @@ def get_value(self, series, key): iloc = self.get_loc(key) return s[iloc] except KeyError: - if (len(self) > 0 - and (self.holds_integer() or self.is_boolean())): + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): raise elif is_integer(key): return s[key] @@ -3951,46 +3957,72 @@ def join(self, other, how='left', level=None, return_indexers=False, def _join_multi(self, other, how, return_indexers=True): from .multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + + # figure out join names + self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = self_names & other_names + + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") + self_is_mi = isinstance(self, MultiIndex) other_is_mi = isinstance(other, MultiIndex) - # figure out join names - self_names = com._not_none(*self.names) - other_names = com._not_none(*other.names) - overlap = list(set(self_names) & set(other_names)) - - # need at least 1 in common, but not more than 1 - if not len(overlap): - raise ValueError("cannot join with no level specified and no " - "overlapping names") - if len(overlap) > 1: - raise NotImplementedError("merging with more than one level " - "overlap on a multi-index is not " - "implemented") - jl = overlap[0] + if self_is_mi and other_is_mi: + + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) + + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) + + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) + + # Restore the dropped 
levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names + levels, labels, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) + + # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) + + multi_join_idx = multi_join_idx.remove_unused_levels() + + return multi_join_idx, lidx, ridx + + jl = list(overlap)[0] + + # Case where only one index is multi # make the indices into mi's that match - if not (self_is_mi and other_is_mi): - - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) - - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) - - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) - # 2 multi-indexes - raise NotImplementedError("merging with both multi-indexes is not " - "implemented") + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c878d16fac2e9..5ae7848b5adc6 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -32,7 +32,7 @@ create_block_manager_from_blocks) from pandas.core.series import Series from pandas.core.reshape.util import cartesian_product -from 
pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg from pandas.util._validators import validate_axis_style_args _shared_doc_kwargs = dict( @@ -1235,7 +1235,12 @@ def reindex(self, *args, **kwargs): kwargs.update(axes) kwargs.pop('axis', None) kwargs.pop('labels', None) - return super(Panel, self).reindex(**kwargs) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + # do not warn about constructing Panel when reindexing + result = super(Panel, self).reindex(**kwargs) + return result @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.rename.__doc__) @@ -1377,25 +1382,37 @@ def join(self, other, how='left', lsuffix='', rsuffix=''): return concat([self] + list(other), axis=0, join=how, join_axes=join_axes, verify_integrity=True) + @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', + mapping={False: 'ignore', True: 'raise'}) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ - Modify Panel in place using non-NA values from passed - Panel, or object coercible to Panel. Aligns on items + Modify Panel in place using non-NA values from other Panel. + + May also use object coercible to Panel. Will align on items. Parameters ---------- other : Panel, or object coercible to Panel - join : How to join individual DataFrames - {'left', 'right', 'outer', 'inner'}, default 'left' - overwrite : boolean, default True - If True then overwrite values for common keys in the calling panel - filter_func : callable(1d-array) -> 1d-array, default None + The object from which the caller will be udpated. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + How individual DataFrames are joined. + overwrite : bool, default True + If True then overwrite values for common keys in the calling Panel. 
+ filter_func : callable(1d-array) -> 1d-array, default None Can choose to replace values other than NA. Return True for values - that should be updated - raise_conflict : bool - If True, will raise an error if a DataFrame and other both - contain data in the same place. + that should be updated. + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise an error if a DataFrame and other both contain data in the same place. + + .. versionchanged :: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. + + See Also + -------- + DataFrame.update : Similar method for DataFrames. + dict.update : Similar method for dictionaries. """ if not isinstance(other, self._constructor): @@ -1406,8 +1423,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, other = other.reindex(**{axis_name: axis_values}) for frame in axis_values: - self[frame].update(other[frame], join, overwrite, filter_func, - raise_conflict) + self[frame].update(other[frame], join=join, overwrite=overwrite, + filter_func=filter_func, errors=errors) def _get_join_index(self, other, how): if how == 'left': diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3d6f55c907269..93a6e4538cbc1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1122,6 +1122,95 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', return join_func(lkey, rkey, count, **kwargs) +def _restore_dropped_levels_multijoin(left, right, dropped_level_names, + join_index, lindexer, rindexer): + """ + *this is an internal non-public method* + + Returns the levels, labels and names of a multi-index to multi-index join. + Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. 
+ The method relies on lindexer, rindexer which hold the index positions of + left and right, where a join was feasible. + + Parameters + ---------- + left : MultiIndex + left index + right : MultiIndex + right index + dropped_level_names : str array + list of non-common level names + join_index : MultiIndex + the index of the join between the + common levels of left and right + lindexer : intp array + left indexer + rindexer : intp array + right indexer + + Returns + ------- + levels : list of Index + levels of combined multiindexes + labels : intp array + labels of combined multiindexes + names : str array + names of combined multiindexes + + """ + + def _convert_to_mulitindex(index): + if isinstance(index, MultiIndex): + return index + else: + return MultiIndex.from_arrays([index.values], + names=[index.name]) + + # For multi-multi joins with one overlapping level, + # the returned index is of type Index + # Assure that join_index is of type MultiIndex + # so that dropped levels can be appended + join_index = _convert_to_mulitindex(join_index) + + join_levels = join_index.levels + join_labels = join_index.labels + join_names = join_index.names + + # lindexer and rindexer hold the indexes where the join occurred + # for left and right respectively. 
If left/right is None then + # the join occurred on all indices of left/right + if lindexer is None: + lindexer = range(left.size) + + if rindexer is None: + rindexer = range(right.size) + + # Iterate through the levels that must be restored + for dropped_level_name in dropped_level_names: + if dropped_level_name in left.names: + idx = left + indexer = lindexer + else: + idx = right + indexer = rindexer + + # The index of the level name to be restored + name_idx = idx.names.index(dropped_level_name) + + restore_levels = idx.levels[name_idx] + # Inject -1 in the labels list where a join was not possible + # IOW indexer[i]=-1 + labels = idx.labels[name_idx] + restore_labels = algos.take_nd(labels, indexer, fill_value=-1) + + join_levels = join_levels + [restore_levels] + join_labels = join_labels + [restore_labels] + join_names = join_names + [dropped_level_name] + + return join_levels, join_labels, join_names + + class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' diff --git a/pandas/core/series.py b/pandas/core/series.py index 7f832009ca273..8fba3030be9d4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -314,7 +314,6 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, .. deprecated :: 0.23.0 Use pd.Series(..) constructor instead. - """ warnings.warn("'from_array' is deprecated and will be removed in a " "future version. Please use the pd.Series(..) 
" @@ -437,7 +436,6 @@ def values(self): array(['2013-01-01T05:00:00.000000000', '2013-01-02T05:00:00.000000000', '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') - """ return self._data.external_values() @@ -1824,7 +1822,6 @@ def round(self, decimals=0, *args, **kwargs): -------- numpy.around DataFrame.round - """ nv.validate_round(args, kwargs) result = com.values_from_object(self).round(decimals) @@ -1906,7 +1903,6 @@ def corr(self, other, method='pearson', min_periods=None): min_periods : int, optional Minimum number of observations needed to have a valid result - Returns ------- correlation : float @@ -2230,8 +2226,6 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): Traceback (most recent call last): ... ValueError: Indexes have overlapping values: [0, 1, 2] - - """ from pandas.core.reshape.concat import concat @@ -2436,7 +2430,6 @@ def update(self, other): 1 2 2 6 dtype: int64 - """ other = other.reindex_like(self) mask = notna(other) @@ -3011,7 +3004,6 @@ def swaplevel(self, i=-2, j=-1, copy=True): The indexes ``i`` and ``j`` are now optional, and default to the two innermost levels of the index. - """ new_index = self.index.swaplevel(i, j) return self._constructor(self._values, index=new_index, @@ -3336,8 +3328,6 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): New York 3.044522 Helsinki 2.484907 dtype: float64 - - """ if len(self) == 0: return self._constructor(dtype=self.dtype, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6f64605bcf175..b63e44c6c3437 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -88,6 +88,10 @@ Maximum number of columns to display in the console. show_dimensions : bool, default False Display DataFrame dimensions (number of rows by number of columns). + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + + .. 
versionadded:: 0.18.0 """ _VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify", @@ -101,8 +105,6 @@ String representation of the dataframe. """ -docstring_to_string = common_docstring + return_docstring - class CategoricalFormatter(object): @@ -608,11 +610,6 @@ def to_string(self): else: # max_cols == 0. Try to fit frame to terminal text = self.adj.adjoin(1, *strcols).split('\n') max_len = Series(text).str.len().max() - headers = [ele[0] for ele in strcols] - # Size of last col determines dot col size. See - # `self._to_str_columns - size_tr_col = len(headers[self.tr_size_col]) - max_len += size_tr_col # Need to make space for largest row # plus truncate dot col dif = max_len - self.w # '+ 1' to avoid too wide repr (GH PR #17023) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 2a2a3e57729ec..967e5fca5f711 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -305,6 +305,8 @@ def _column_header(): align = self.fmt.justify if truncate_h: + if not self.fmt.index: + row_levels = 0 ins_col = row_levels + self.fmt.tr_col_num col_row.insert(ins_col, '...') @@ -336,15 +338,10 @@ def _write_body(self, indent): fmt_values[i] = self.fmt._format_col(i) # write values - if self.fmt.index: - if isinstance(self.frame.index, ABCMultiIndex): - self._write_hierarchical_rows(fmt_values, indent) - else: - self._write_regular_rows(fmt_values, indent) + if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): + self._write_hierarchical_rows(fmt_values, indent) else: - for i in range(min(len(self.frame), self.max_rows)): - row = [fmt_values[j][i] for j in range(len(self.columns))] - self.write_tr(row, indent, self.indent_delta, tags=None) + self._write_regular_rows(fmt_values, indent) indent -= self.indent_delta self.write('', indent) @@ -358,11 +355,16 @@ def _write_regular_rows(self, fmt_values, indent): ncols = len(self.fmt.tr_frame.columns) nrows = len(self.fmt.tr_frame) - fmt = self.fmt._get_formatter('__index__') - 
if fmt is not None: - index_values = self.fmt.tr_frame.index.map(fmt) + + if self.fmt.index: + fmt = self.fmt._get_formatter('__index__') + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + index_values = self.fmt.tr_frame.index.format() + row_levels = 1 else: - index_values = self.fmt.tr_frame.index.format() + row_levels = 0 row = [] for i in range(nrows): @@ -370,17 +372,18 @@ def _write_regular_rows(self, fmt_values, indent): if truncate_v and i == (self.fmt.tr_row_num): str_sep_row = ['...'] * len(row) self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=1) + tags=None, nindex_levels=row_levels) row = [] - row.append(index_values[i]) + if self.fmt.index: + row.append(index_values[i]) row.extend(fmt_values[j][i] for j in range(ncols)) if truncate_h: - dot_col_ix = self.fmt.tr_col_num + 1 + dot_col_ix = self.fmt.tr_col_num + row_levels row.insert(dot_col_ix, '...') self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=1) + nindex_levels=row_levels) def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="{span}" valign="top"' diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 22c5d146e1a06..25c5222b5f03c 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -313,7 +313,17 @@ def test_update_filtered(self): [1.5, nan, 7.]]) assert_frame_equal(df, expected) - def test_update_raise(self): + @pytest.mark.parametrize('bad_kwarg, exception, msg', [ + # errors must be 'ignore' or 'raise' + ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), + ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') + ]) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + df = DataFrame([[1.5, 1, 3.]]) + with pytest.raises(exception, match=msg): + df.update(df, **bad_kwarg) + + def test_update_raise_on_overlap(self): df = 
DataFrame([[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3], @@ -322,7 +332,14 @@ def test_update_raise(self): other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2]) with pytest.raises(ValueError, match="Data overlaps"): - df.update(other, raise_conflict=True) + df.update(other, errors='raise') + + @pytest.mark.parametrize('raise_conflict', [True, False]) + def test_update_deprecation(self, raise_conflict): + df = DataFrame([[1.5, 1, 3.]]) + other = DataFrame() + with tm.assert_produces_warning(FutureWarning): + df.update(other, raise_conflict=raise_conflict) def test_update_from_non_df(self): d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])} diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 619f60a42e0be..424f6b1f9a77a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -504,6 +504,13 @@ def test_constructor_cast(self): with pytest.raises(ValueError, match=msg): Index(["a", "b", "c"], dtype=float) + def test_constructor_unwraps_index(self, indices): + if isinstance(indices, pd.MultiIndex): + raise pytest.skip("MultiIndex has no ._data") + a = indices + b = type(a)(a) + tm.assert_equal(a._data, b._data) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', diff --git a/pandas/tests/io/formats/data/gh15019_expected_output.html b/pandas/tests/io/formats/data/gh15019_expected_output.html new file mode 100644 index 0000000000000..5fb9d960f4465 --- /dev/null +++ b/pandas/tests/io/formats/data/gh15019_expected_output.html @@ -0,0 +1,30 @@ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
1.7640520.400157
0.9787382.240893
......
0.950088-0.151357
-0.1032190.410599
diff --git a/pandas/tests/io/formats/data/gh22783_expected_output.html b/pandas/tests/io/formats/data/gh22783_expected_output.html new file mode 100644 index 0000000000000..107db43c48639 --- /dev/null +++ b/pandas/tests/io/formats/data/gh22783_expected_output.html @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + +
01...34
1.7640520.400157...2.2408931.867558
-0.9772780.950088...-0.1032190.410599
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 28aa8a92cc410..0814df8240e13 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -305,14 +305,10 @@ def test_repr_non_interactive(self): assert not has_truncated_repr(df) assert not has_expanded_repr(df) - def test_repr_truncates_terminal_size(self): + def test_repr_truncates_terminal_size(self, mock): # https://github.com/pandas-dev/pandas/issues/21180 # TODO: use mock fixutre. # This is being backported, so doing it directly here. - try: - from unittest import mock - except ImportError: - mock = pytest.importorskip("mock") terminal_size = (118, 96) p1 = mock.patch('pandas.io.formats.console.get_terminal_size', @@ -343,6 +339,17 @@ def test_repr_truncates_terminal_size(self): assert df2.columns[0] in result.split('\n')[0] + def test_repr_truncates_terminal_size_full(self, mock): + # GH 22984 ensure entire window is filled + terminal_size = (80, 24) + df = pd.DataFrame(np.random.rand(1, 7)) + p1 = mock.patch('pandas.io.formats.console.get_terminal_size', + return_value=terminal_size) + p2 = mock.patch('pandas.io.formats.format.get_terminal_size', + return_value=terminal_size) + with p1, p2: + assert "..." 
not in str(df) + def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: @@ -1451,6 +1458,12 @@ def test_to_string_format_na(self): '4 4.0 bar') assert result == expected + def test_to_string_decimal(self): + # Issue #23614 + df = DataFrame({'A': [6.0, 3.1, 2.2]}) + expected = ' A\n0 6,0\n1 3,1\n2 2,2' + assert df.to_string(decimal=',') == expected + def test_to_string_line_width(self): df = DataFrame(123, lrange(10, 15), lrange(30)) s = df.to_string(line_width=80) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 0416cf6da7912..32cf21ddf5f38 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -22,6 +22,28 @@ pass +def expected_html(datapath, name): + """ + Read HTML file from formats data directory. + + Parameters + ---------- + datapath : pytest fixture + The datapath fixture injected into a test by pytest. + name : str + The name of the HTML file without the suffix. + + Returns + ------- + str : contents of HTML file. 
+ """ + filename = '.'.join([name, 'html']) + filepath = datapath('io', 'formats', 'data', filename) + with open(filepath) as f: + html = f.read() + return html.rstrip() + + class TestToHTML(object): def test_to_html_with_col_space(self): @@ -1881,6 +1903,29 @@ def test_to_html_multiindex_max_cols(self): """) assert result == expected + @pytest.mark.parametrize('index', [False, 0]) + def test_to_html_truncation_index_false_max_rows(self, datapath, index): + # GH 15019 + data = [[1.764052, 0.400157], + [0.978738, 2.240893], + [1.867558, -0.977278], + [0.950088, -0.151357], + [-0.103219, 0.410599]] + df = pd.DataFrame(data) + result = df.to_html(max_rows=4, index=index) + expected = expected_html(datapath, 'gh15019_expected_output') + assert result == expected + + @pytest.mark.parametrize('index', [False, 0]) + def test_to_html_truncation_index_false_max_cols(self, datapath, index): + # GH 22783 + data = [[1.764052, 0.400157, 0.978738, 2.240893, 1.867558], + [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599]] + df = pd.DataFrame(data) + result = df.to_html(max_cols=4, index=index) + expected = expected_html(datapath, 'gh22783_expected_output') + assert result == expected + def test_to_html_notebook_has_style(self): df = pd.DataFrame({"A": [1, 2, 3]}) result = df.to_html(notebook=True) diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index 590736f720e67..6a41b4636e532 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -8,6 +8,7 @@ """ import csv +import sys import pytest @@ -230,6 +231,7 @@ def test_multi_char_sep_quotes(self): self.read_csv(StringIO(data), sep=',,', quoting=csv.QUOTE_NONE) + @tm.capture_stderr def test_none_delimiter(self): # see gh-13374 and gh-17465 @@ -247,6 +249,9 @@ def test_none_delimiter(self): warn_bad_lines=True) tm.assert_frame_equal(result, expected) + warning = sys.stderr.getvalue() + assert 'Skipping line 3' in warning 
+ def test_skipfooter_bad_row(self): # see gh-13879 # see gh-15910 diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d9297cdc5ab3e..7ee88f223cd95 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,15 +8,14 @@ import numpy as np import pytest from numpy import nan -from numpy.random import randn import pandas as pd import pandas.util.testing as tm from pandas import (Categorical, CategoricalIndex, DataFrame, DatetimeIndex, - Float64Index, Index, Int64Index, MultiIndex, RangeIndex, + Float64Index, Int64Index, MultiIndex, RangeIndex, Series, UInt64Index) from pandas.api.types import CategoricalDtype as CDT -from pandas.compat import lrange, lzip +from pandas.compat import lrange from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.reshape.concat import concat @@ -920,521 +919,6 @@ def _check_merge(x, y): assert_frame_equal(result, expected, check_names=False) -class TestMergeMulti(object): - - def setup_method(self, method): - self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, - columns=['j_one', 'j_two', 'j_three']) - - # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] - - data = np.random.randn(len(key1)) - self.data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - def test_merge_on_multikey(self): - joined = self.data.join(self.to_join, on=['key1', 'key2']) - - join_key = Index(lzip(self.data['key1'], self.data['key2'])) - indexer = self.to_join.index.get_indexer(join_key) - ex_values = 
self.to_join.values.take(indexer, axis=0) - ex_values[indexer == -1] = np.nan - expected = self.data.join(DataFrame(ex_values, - columns=self.to_join.columns)) - - # TODO: columns aren't in the same order yet - assert_frame_equal(joined, expected.loc[:, joined.columns]) - - left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True) - right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'], - kind='mergesort') - assert_frame_equal(left, right) - - def test_left_join_multi_index(self): - icols = ['1st', '2nd', '3rd'] - - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord('a') - return (f(df['1st']) + f(df['3rd']) * 1e2 + - df['2nd'].fillna(0) * 1e4) - - def run_asserts(left, right): - for sort in [False, True]: - res = left.join(right, on=icols, how='left', sort=sort) - - assert len(left) < len(res) + 1 - assert not res['4th'].isna().any() - assert not res['5th'].isna().any() - - tm.assert_series_equal( - res['4th'], - res['5th'], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res['4th'], result, check_names=False) - assert result.name is None - - if sort: - tm.assert_frame_equal( - res, res.sort_values(icols, kind='mergesort')) - - out = merge(left, right.reset_index(), on=icols, - sort=sort, how='left') - - res.index = np.arange(len(res)) - tm.assert_frame_equal(out, res) - - lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) - left = DataFrame(np.random.choice(lc, (5000, 2)), - columns=['1st', '3rd']) - left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) - - i = np.random.permutation(len(left)) - right = left.iloc[i].copy() - - left['4th'] = bind_cols(left) - right['5th'] = - bind_cols(right) - right.set_index(icols, inplace=True) - - run_asserts(left, right) - - # inject some nulls - left.loc[1::23, '1st'] = np.nan - left.loc[2::37, '2nd'] = np.nan - left.loc[3::43, '3rd'] = np.nan - left['4th'] = bind_cols(left) - - i = 
np.random.permutation(len(left)) - right = left.iloc[i, :-1] - right['5th'] = - bind_cols(right) - right.set_index(icols, inplace=True) - - run_asserts(left, right) - - def test_merge_right_vs_left(self): - # compare left vs right merge with multikey - for sort in [False, True]: - merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'], - right_index=True, how='left', sort=sort) - - merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'], - left_index=True, how='right', - sort=sort) - - merged2 = merged2.loc[:, merged1.columns] - assert_frame_equal(merged1, merged2) - - def test_compress_group_combinations(self): - - # ~ 40000000 possible unique groups - key1 = tm.rands_array(10, 10000) - key1 = np.tile(key1, 2) - key2 = key1[::-1] - - df = DataFrame({'key1': key1, 'key2': key2, - 'value1': np.random.randn(20000)}) - - df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], - 'value2': np.random.randn(10000)}) - - # just to hit the label compression code path - merge(df, df2, how='outer') - - def test_left_join_index_preserve_order(self): - - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) - - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) - - result = left.join(right, on=['k1', 'k2']) - - expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal( - result.sort_values(['k1', 'k2'], kind='mergesort'), - left.join(right, on=['k1', 'k2'], sort=True)) - - # test join with multi dtypes blocks - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), - 'v': np.array(np.arange(24), dtype=np.int32)}) - - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = 
DataFrame({'v2': [5, 7]}, index=index) - - result = left.join(right, on=['k1', 'k2']) - - expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal( - result.sort_values(['k1', 'k2'], kind='mergesort'), - left.join(right, on=['k1', 'k2'], sort=True)) - - # do a right join for an extra test - joined = merge(right, left, left_index=True, - right_on=['k1', 'k2'], how='right') - tm.assert_frame_equal(joined.loc[:, expected.columns], expected) - - def test_left_join_index_multi_match_multiindex(self): - left = DataFrame([ - ['X', 'Y', 'C', 'a'], - ['W', 'Y', 'C', 'e'], - ['V', 'Q', 'A', 'h'], - ['V', 'R', 'D', 'i'], - ['X', 'Y', 'D', 'b'], - ['X', 'Y', 'A', 'c'], - ['W', 'Q', 'B', 'f'], - ['W', 'R', 'C', 'g'], - ['V', 'Y', 'C', 'j'], - ['X', 'Y', 'B', 'd']], - columns=['cola', 'colb', 'colc', 'tag'], - index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) - - right = DataFrame([ - ['W', 'R', 'C', 0], - ['W', 'Q', 'B', 3], - ['W', 'Q', 'B', 8], - ['X', 'Y', 'A', 1], - ['X', 'Y', 'A', 4], - ['X', 'Y', 'B', 5], - ['X', 'Y', 'C', 6], - ['X', 'Y', 'C', 9], - ['X', 'Q', 'C', -6], - ['X', 'R', 'C', -9], - ['V', 'Y', 'C', 7], - ['V', 'R', 'D', 2], - ['V', 'R', 'D', -1], - ['V', 'Q', 'A', -3]], - columns=['col1', 'col2', 'col3', 'val']) - - right.set_index(['col1', 'col2', 'col3'], inplace=True) - result = left.join(right, on=['cola', 'colb', 'colc'], how='left') - - expected = DataFrame([ - ['X', 'Y', 'C', 'a', 6], - ['X', 'Y', 'C', 'a', 9], - ['W', 'Y', 'C', 'e', nan], - ['V', 'Q', 'A', 'h', -3], - ['V', 'R', 'D', 'i', 2], - ['V', 'R', 'D', 'i', -1], - ['X', 'Y', 'D', 'b', nan], - ['X', 'Y', 'A', 'c', 1], - ['X', 'Y', 'A', 'c', 4], - ['W', 'Q', 'B', 'f', 3], - ['W', 'Q', 'B', 'f', 8], - ['W', 'R', 'C', 'g', 0], - ['V', 'Y', 'C', 'j', 7], - ['X', 'Y', 'B', 'd', 5]], - columns=['cola', 'colb', 
'colc', 'tag', 'val'], - index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) - - tm.assert_frame_equal(result, expected) - - result = left.join(right, on=['cola', 'colb', 'colc'], - how='left', sort=True) - - tm.assert_frame_equal( - result, - expected.sort_values(['cola', 'colb', 'colc'], kind='mergesort')) - - # GH7331 - maintain left frame order in left merge - right.reset_index(inplace=True) - right.columns = left.columns[:3].tolist() + right.columns[-1:].tolist() - result = merge(left, right, how='left', on=left.columns[:-1].tolist()) - expected.index = np.arange(len(expected)) - tm.assert_frame_equal(result, expected) - - def test_left_join_index_multi_match(self): - left = DataFrame([ - ['c', 0], - ['b', 1], - ['a', 2], - ['b', 3]], - columns=['tag', 'val'], - index=[2, 0, 1, 3]) - - right = DataFrame([ - ['a', 'v'], - ['c', 'w'], - ['c', 'x'], - ['d', 'y'], - ['a', 'z'], - ['c', 'r'], - ['e', 'q'], - ['c', 's']], - columns=['tag', 'char']) - - right.set_index('tag', inplace=True) - result = left.join(right, on='tag', how='left') - - expected = DataFrame([ - ['c', 0, 'w'], - ['c', 0, 'x'], - ['c', 0, 'r'], - ['c', 0, 's'], - ['b', 1, nan], - ['a', 2, 'v'], - ['a', 2, 'z'], - ['b', 3, nan]], - columns=['tag', 'val', 'char'], - index=[2, 2, 2, 2, 0, 1, 1, 3]) - - tm.assert_frame_equal(result, expected) - - result = left.join(right, on='tag', how='left', sort=True) - tm.assert_frame_equal( - result, expected.sort_values('tag', kind='mergesort')) - - # GH7331 - maintain left frame order in left merge - result = merge(left, right.reset_index(), how='left', on='tag') - expected.index = np.arange(len(expected)) - tm.assert_frame_equal(result, expected) - - def test_left_merge_na_buglet(self): - left = DataFrame({'id': list('abcde'), 'v1': randn(5), - 'v2': randn(5), 'dummy': list('abcde'), - 'v3': randn(5)}, - columns=['id', 'v1', 'v2', 'dummy', 'v3']) - right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], - 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) - - 
merged = merge(left, right, on='id', how='left') - - rdf = right.drop(['id'], axis=1) - expected = left.join(rdf) - tm.assert_frame_equal(merged, expected) - - def test_merge_na_keys(self): - data = [[1950, "A", 1.5], - [1950, "B", 1.5], - [1955, "B", 1.5], - [1960, "B", np.nan], - [1970, "B", 4.], - [1950, "C", 4.], - [1960, "C", np.nan], - [1965, "C", 3.], - [1970, "C", 4.]] - - frame = DataFrame(data, columns=["year", "panel", "data"]) - - other_data = [[1960, 'A', np.nan], - [1970, 'A', np.nan], - [1955, 'A', np.nan], - [1965, 'A', np.nan], - [1965, 'B', np.nan], - [1955, 'C', np.nan]] - other = DataFrame(other_data, columns=['year', 'panel', 'data']) - - result = frame.merge(other, how='outer') - - expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') - expected = expected.replace(-999, np.nan) - - tm.assert_frame_equal(result, expected) - - def test_join_multi_levels(self): - - # GH 3662 - # merge multi-levels - household = ( - DataFrame( - dict(household_id=[1, 2, 3], - male=[0, 1, 0], - wealth=[196087.3, 316478.7, 294750]), - columns=['household_id', 'male', 'wealth']) - .set_index('household_id')) - portfolio = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - name=["ABN Amro", "Robeco", "Royal Dutch Shell", - "Royal Dutch Shell", - "AAB Eastern Europe Equity Fund", - "Postbank BioTech Fonds", np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'name', 'share']) - .set_index(['household_id', 'asset_id'])) - result = household.join(portfolio, how='inner') - expected = ( - DataFrame( - dict(male=[0, 1, 1, 0, 0, 0], - wealth=[196087.3, 316478.7, 316478.7, - 294750.0, 294750.0, 294750.0], - name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', - 'Royal Dutch Shell', - 'AAB Eastern Europe Equity Fund', - 'Postbank BioTech Fonds'], - share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], - 
household_id=[1, 2, 2, 3, 3, 3], - asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', - 'gb00b03mlx29', 'lu0197800237', - 'nl0000289965'])) - .set_index(['household_id', 'asset_id']) - .reindex(columns=['male', 'wealth', 'name', 'share'])) - assert_frame_equal(result, expected) - - assert_frame_equal(result, expected) - - # equivalency - result2 = (merge(household.reset_index(), portfolio.reset_index(), - on=['household_id'], how='inner') - .set_index(['household_id', 'asset_id'])) - assert_frame_equal(result2, expected) - - result = household.join(portfolio, how='outer') - expected = (concat([ - expected, - (DataFrame( - dict(share=[1.00]), - index=MultiIndex.from_tuples( - [(4, np.nan)], - names=['household_id', 'asset_id']))) - ], axis=0, sort=True).reindex(columns=expected.columns)) - assert_frame_equal(result, expected) - - # invalid cases - household.index.name = 'foo' - - def f(): - household.join(portfolio, how='inner') - - pytest.raises(ValueError, f) - - portfolio2 = portfolio.copy() - portfolio2.index.set_names(['household_id', 'foo']) - - def f(): - portfolio2.join(portfolio, how='inner') - - pytest.raises(ValueError, f) - - def test_join_multi_levels2(self): - - # some more advanced merges - # GH6360 - household = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'share']) - .set_index(['household_id', 'asset_id'])) - - log_return = DataFrame(dict( - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 180, 181], - log_return=[.09604978, -.06524096, .03532373, .03025441, .036997] - )).set_index(["asset_id", "t"]) - - expected = ( - DataFrame(dict( - household_id=[2, 2, 2, 3, 3, 3, 3, 3], - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - 
"gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 233, 234, 235, 180, 181], - share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], - log_return=[.09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997] - )) - .set_index(["household_id", "asset_id", "t"]) - .reindex(columns=['share', 'log_return'])) - - def f(): - household.join(log_return, how='inner') - - pytest.raises(NotImplementedError, f) - - # this is the equivalency - result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='inner') - .set_index(['household_id', 'asset_id', 't'])) - assert_frame_equal(result, expected) - - expected = ( - DataFrame(dict( - household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237", - "nl0000289965", None], - t=[None, None, 233, 234, 235, 233, 234, - 235, 180, 181, None, None], - share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, - 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], - log_return=[None, None, .09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997, None, None] - )) - .set_index(["household_id", "asset_id", "t"])) - - def f(): - household.join(log_return, how='outer') - - pytest.raises(NotImplementedError, f) - - @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) - def test_merge_datetime_index(self, klass): - # see gh-19038 - df = DataFrame([1, 2, 3], - ["2016-01-01", "2017-01-01", "2018-01-01"], - columns=["a"]) - df.index = pd.to_datetime(df.index) - on_vector = df.index.year - - if klass is not None: - on_vector = klass(on_vector) - - expected = DataFrame( - OrderedDict([ - ("a", [1, 2, 3]), - ("key_1", [2016, 2017, 2018]), - ]) - ) - - result = df.merge(df, on=["a", on_vector], how="inner") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - 
OrderedDict([ - ("key_0", [2016, 2017, 2018]), - ("a_x", [1, 2, 3]), - ("a_y", [1, 2, 3]), - ]) - ) - - result = df.merge(df, on=[df.index.year], how="inner") - tm.assert_frame_equal(result, expected) - - class TestMergeDtypes(object): @pytest.mark.parametrize('right_vals', [ diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py new file mode 100644 index 0000000000000..a1158201844b0 --- /dev/null +++ b/pandas/tests/reshape/merge/test_multi.py @@ -0,0 +1,672 @@ +# pylint: disable=E1103 + +from collections import OrderedDict + +import numpy as np +from numpy import nan +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +from pandas.core.reshape.concat import concat +from pandas.core.reshape.merge import merge +import pandas.util.testing as tm + + +@pytest.fixture +def left(): + """left dataframe (not multi-indexed) for multi-index join tests""" + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + return DataFrame({'key1': key1, 'key2': key2, 'data': data}) + + +@pytest.fixture +def right(): + """right dataframe (multi-indexed) for multi-index join tests""" + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['key1', 'key2']) + + return DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + +@pytest.fixture +def left_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C'], + Destination=['A', 'B', 'A', 'C', 'A'], + Period=['AM', 'AM', 'IP', 'AM', 'OP'], + TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], + Trips=[1987, 3647, 2470, 4296, 4444]), + columns=['Origin', 'Destination', 'Period', + 
'TripPurp', 'Trips']) + .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) + + +@pytest.fixture +def right_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'], + Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'], + Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'], + LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'], + Distance=[100, 80, 90, 80, 75, 35, 55]), + columns=['Origin', 'Destination', 'Period', + 'LinkType', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + + +@pytest.fixture +def on_cols_multi(): + return ['Origin', 'Destination', 'Period'] + + +@pytest.fixture +def idx_cols_multi(): + return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'] + + +class TestMergeMulti(object): + + def setup_method(self): + self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + self.data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + def test_merge_on_multikey(self, left, right, join_type): + on_cols = ['key1', 'key2'] + result = (left.join(right, on=on_cols, how=join_type) + .reset_index(drop=True)) + + expected = pd.merge(left, right.reset_index(), + on=on_cols, how=join_type) + + tm.assert_frame_equal(result, expected) + + result = (left.join(right, on=on_cols, how=join_type, sort=True) + .reset_index(drop=True)) + + expected = pd.merge(left, right.reset_index(), + on=on_cols, how=join_type, sort=True) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("sort", [False, 
True]) + def test_left_join_multi_index(self, left, right, sort): + icols = ['1st', '2nd', '3rd'] + + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord('a') + return (f(df['1st']) + f(df['3rd']) * 1e2 + + df['2nd'].fillna(0) * 1e4) + + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how='left', sort=sort) + + assert len(left) < len(res) + 1 + assert not res['4th'].isna().any() + assert not res['5th'].isna().any() + + tm.assert_series_equal( + res['4th'], - res['5th'], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res['4th'], result, check_names=False) + assert result.name is None + + if sort: + tm.assert_frame_equal( + res, res.sort_values(icols, kind='mergesort')) + + out = merge(left, right.reset_index(), on=icols, + sort=sort, how='left') + + res.index = np.arange(len(res)) + tm.assert_frame_equal(out, res) + + lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) + left = DataFrame(np.random.choice(lc, (5000, 2)), + columns=['1st', '3rd']) + left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) + + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + + left['4th'] = bind_cols(left) + right['5th'] = - bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + # inject some nulls + left.loc[1::23, '1st'] = np.nan + left.loc[2::37, '2nd'] = np.nan + left.loc[3::43, '3rd'] = np.nan + left['4th'] = bind_cols(left) + + i = np.random.permutation(len(left)) + right = left.iloc[i, :-1] + right['5th'] = - bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + @pytest.mark.parametrize("sort", [False, True]) + def test_merge_right_vs_left(self, left, right, sort): + # compare left vs right merge with multikey + on_cols = ['key1', 'key2'] + merged_left_right = left.merge(right, + left_on=on_cols, right_index=True, + how='left', sort=sort) + + merge_right_left = 
right.merge(left, + right_on=on_cols, left_index=True, + how='right', sort=sort) + + # Reorder columns + merge_right_left = merge_right_left[merged_left_right.columns] + + tm.assert_frame_equal(merged_left_right, merge_right_left) + + def test_compress_group_combinations(self): + + # ~ 40000000 possible unique groups + key1 = tm.rands_array(10, 10000) + key1 = np.tile(key1, 2) + key2 = key1[::-1] + + df = DataFrame({'key1': key1, 'key2': key2, + 'value1': np.random.randn(20000)}) + + df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], + 'value2': np.random.randn(10000)}) + + # just to hit the label compression code path + merge(df, df2, how='outer') + + def test_left_join_index_preserve_order(self): + + on_cols = ['k1', 'k2'] + left = DataFrame({'k1': [0, 1, 2] * 8, + 'k2': ['foo', 'bar'] * 12, + 'v': np.array(np.arange(24), dtype=np.int64)}) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': [5, 7]}, index=index) + + result = left.join(right, on=on_cols) + + expected = left.copy() + expected['v2'] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + + tm.assert_frame_equal(result, expected) + + result.sort_values(on_cols, kind='mergesort', inplace=True) + expected = left.join(right, on=on_cols, sort=True) + + tm.assert_frame_equal(result, expected) + + # test join with multi dtypes blocks + left = DataFrame({'k1': [0, 1, 2] * 8, + 'k2': ['foo', 'bar'] * 12, + 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), + 'v': np.array(np.arange(24), dtype=np.int32)}) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': [5, 7]}, index=index) + + result = left.join(right, on=on_cols) + + expected = left.copy() + expected['v2'] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + + tm.assert_frame_equal(result, 
expected) + + result = result.sort_values(on_cols, kind='mergesort') + expected = left.join(right, on=on_cols, sort=True) + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match_multiindex(self): + left = DataFrame([ + ['X', 'Y', 'C', 'a'], + ['W', 'Y', 'C', 'e'], + ['V', 'Q', 'A', 'h'], + ['V', 'R', 'D', 'i'], + ['X', 'Y', 'D', 'b'], + ['X', 'Y', 'A', 'c'], + ['W', 'Q', 'B', 'f'], + ['W', 'R', 'C', 'g'], + ['V', 'Y', 'C', 'j'], + ['X', 'Y', 'B', 'd']], + columns=['cola', 'colb', 'colc', 'tag'], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) + + right = (DataFrame([ + ['W', 'R', 'C', 0], + ['W', 'Q', 'B', 3], + ['W', 'Q', 'B', 8], + ['X', 'Y', 'A', 1], + ['X', 'Y', 'A', 4], + ['X', 'Y', 'B', 5], + ['X', 'Y', 'C', 6], + ['X', 'Y', 'C', 9], + ['X', 'Q', 'C', -6], + ['X', 'R', 'C', -9], + ['V', 'Y', 'C', 7], + ['V', 'R', 'D', 2], + ['V', 'R', 'D', -1], + ['V', 'Q', 'A', -3]], + columns=['col1', 'col2', 'col3', 'val']) + .set_index(['col1', 'col2', 'col3'])) + + result = left.join(right, on=['cola', 'colb', 'colc'], how='left') + + expected = DataFrame([ + ['X', 'Y', 'C', 'a', 6], + ['X', 'Y', 'C', 'a', 9], + ['W', 'Y', 'C', 'e', nan], + ['V', 'Q', 'A', 'h', -3], + ['V', 'R', 'D', 'i', 2], + ['V', 'R', 'D', 'i', -1], + ['X', 'Y', 'D', 'b', nan], + ['X', 'Y', 'A', 'c', 1], + ['X', 'Y', 'A', 'c', 4], + ['W', 'Q', 'B', 'f', 3], + ['W', 'Q', 'B', 'f', 8], + ['W', 'R', 'C', 'g', 0], + ['V', 'Y', 'C', 'j', 7], + ['X', 'Y', 'B', 'd', 5]], + columns=['cola', 'colb', 'colc', 'tag', 'val'], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=['cola', 'colb', 'colc'], + how='left', sort=True) + + expected = expected.sort_values(['cola', 'colb', 'colc'], + kind='mergesort') + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match(self): + left = DataFrame([ + ['c', 0], + ['b', 1], + ['a', 2], + ['b', 3]], + columns=['tag', 'val'], + index=[2, 0, 1, 3]) + + 
right = (DataFrame([ + ['a', 'v'], + ['c', 'w'], + ['c', 'x'], + ['d', 'y'], + ['a', 'z'], + ['c', 'r'], + ['e', 'q'], + ['c', 's']], + columns=['tag', 'char']) + .set_index('tag')) + + result = left.join(right, on='tag', how='left') + + expected = DataFrame([ + ['c', 0, 'w'], + ['c', 0, 'x'], + ['c', 0, 'r'], + ['c', 0, 's'], + ['b', 1, nan], + ['a', 2, 'v'], + ['a', 2, 'z'], + ['b', 3, nan]], + columns=['tag', 'val', 'char'], + index=[2, 2, 2, 2, 0, 1, 1, 3]) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on='tag', how='left', sort=True) + expected2 = expected.sort_values('tag', kind='mergesort') + + tm.assert_frame_equal(result, expected2) + + # GH7331 - maintain left frame order in left merge + result = merge(left, right.reset_index(), how='left', on='tag') + expected.index = np.arange(len(expected)) + tm.assert_frame_equal(result, expected) + + def test_left_merge_na_buglet(self): + left = DataFrame({'id': list('abcde'), 'v1': randn(5), + 'v2': randn(5), 'dummy': list('abcde'), + 'v3': randn(5)}, + columns=['id', 'v1', 'v2', 'dummy', 'v3']) + right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], + 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) + + result = merge(left, right, on='id', how='left') + + rdf = right.drop(['id'], axis=1) + expected = left.join(rdf) + tm.assert_frame_equal(result, expected) + + def test_merge_na_keys(self): + data = [[1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.], + [1950, "C", 4.], + [1960, "C", np.nan], + [1965, "C", 3.], + [1970, "C", 4.]] + + frame = DataFrame(data, columns=["year", "panel", "data"]) + + other_data = [[1960, 'A', np.nan], + [1970, 'A', np.nan], + [1955, 'A', np.nan], + [1965, 'A', np.nan], + [1965, 'B', np.nan], + [1955, 'C', np.nan]] + other = DataFrame(other_data, columns=['year', 'panel', 'data']) + + result = frame.merge(other, how='outer') + + expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') + expected 
= expected.replace(-999, np.nan) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) + def test_merge_datetime_index(self, klass): + # see gh-19038 + df = DataFrame([1, 2, 3], + ["2016-01-01", "2017-01-01", "2018-01-01"], + columns=["a"]) + df.index = pd.to_datetime(df.index) + on_vector = df.index.year + + if klass is not None: + on_vector = klass(on_vector) + + expected = DataFrame( + OrderedDict([ + ("a", [1, 2, 3]), + ("key_1", [2016, 2017, 2018]), + ]) + ) + + result = df.merge(df, on=["a", on_vector], how="inner") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + OrderedDict([ + ("key_0", [2016, 2017, 2018]), + ("a_x", [1, 2, 3]), + ("a_y", [1, 2, 3]), + ]) + ) + + result = df.merge(df, on=[df.index.year], how="inner") + tm.assert_frame_equal(result, expected) + + def test_join_multi_levels(self): + + # GH 3662 + # merge multi-levels + household = ( + DataFrame( + dict(household_id=[1, 2, 3], + male=[0, 1, 0], + wealth=[196087.3, 316478.7, 294750]), + columns=['household_id', 'male', 'wealth']) + .set_index('household_id')) + portfolio = ( + DataFrame( + dict(household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", + "gb00b03mlx29", "lu0197800237", "nl0000289965", + np.nan], + name=["ABN Amro", "Robeco", "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", np.nan], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), + columns=['household_id', 'asset_id', 'name', 'share']) + .set_index(['household_id', 'asset_id'])) + result = household.join(portfolio, how='inner') + expected = ( + DataFrame( + dict(male=[0, 1, 1, 0, 0, 0], + wealth=[196087.3, 316478.7, 316478.7, + 294750.0, 294750.0, 294750.0], + name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', + 'Royal Dutch Shell', + 'AAB Eastern Europe Equity Fund', + 'Postbank BioTech Fonds'], + share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], + 
household_id=[1, 2, 2, 3, 3, 3], + asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', + 'gb00b03mlx29', 'lu0197800237', + 'nl0000289965'])) + .set_index(['household_id', 'asset_id']) + .reindex(columns=['male', 'wealth', 'name', 'share'])) + tm.assert_frame_equal(result, expected) + + # equivalency + result = (merge(household.reset_index(), portfolio.reset_index(), + on=['household_id'], how='inner') + .set_index(['household_id', 'asset_id'])) + tm.assert_frame_equal(result, expected) + + result = household.join(portfolio, how='outer') + expected = (concat([ + expected, + (DataFrame( + dict(share=[1.00]), + index=MultiIndex.from_tuples( + [(4, np.nan)], + names=['household_id', 'asset_id']))) + ], axis=0, sort=True).reindex(columns=expected.columns)) + tm.assert_frame_equal(result, expected) + + # invalid cases + household.index.name = 'foo' + + def f(): + household.join(portfolio, how='inner') + + pytest.raises(ValueError, f) + + portfolio2 = portfolio.copy() + portfolio2.index.set_names(['household_id', 'foo']) + + def f(): + portfolio2.join(portfolio, how='inner') + + pytest.raises(ValueError, f) + + def test_join_multi_levels2(self): + + # some more advanced merges + # GH6360 + household = ( + DataFrame( + dict(household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", + "gb00b03mlx29", "lu0197800237", "nl0000289965", + np.nan], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), + columns=['household_id', 'asset_id', 'share']) + .set_index(['household_id', 'asset_id'])) + + log_return = DataFrame(dict( + asset_id=["gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "lu0197800237", "lu0197800237"], + t=[233, 234, 235, 180, 181], + log_return=[.09604978, -.06524096, .03532373, .03025441, .036997] + )).set_index(["asset_id", "t"]) + + expected = ( + DataFrame(dict( + household_id=[2, 2, 2, 3, 3, 3, 3, 3], + asset_id=["gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", + 
"lu0197800237", "lu0197800237"], + t=[233, 234, 235, 233, 234, 235, 180, 181], + share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return=[.09604978, -.06524096, .03532373, + .09604978, -.06524096, .03532373, + .03025441, .036997] + )) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=['share', 'log_return'])) + + # this is the equivalency + result = (merge(household.reset_index(), log_return.reset_index(), + on=['asset_id'], how='inner') + .set_index(['household_id', 'asset_id', 't'])) + tm.assert_frame_equal(result, expected) + + expected = ( + DataFrame(dict( + household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", + "lu0197800237", "lu0197800237", + "nl0000289965", None], + t=[None, None, 233, 234, 235, 233, 234, + 235, 180, 181, None, None], + share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, + 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], + log_return=[None, None, .09604978, -.06524096, .03532373, + .09604978, -.06524096, .03532373, + .03025441, .036997, None, None] + )) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=['share', 'log_return'])) + + result = (merge(household.reset_index(), log_return.reset_index(), + on=['asset_id'], how='outer') + .set_index(['household_id', 'asset_id', 't'])) + + tm.assert_frame_equal(result, expected) + + +class TestJoinMultiMulti(object): + + def test_join_multi_multi(self, left_multi, right_multi, join_type, + on_cols_multi, idx_cols_multi): + # Multi-index join tests + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, on=on_cols_multi). 
+ set_index(idx_cols_multi).sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, + on_cols_multi, idx_cols_multi): + + left_multi = left_multi.drop(columns=left_multi.columns) + right_multi = right_multi.drop(columns=right_multi.columns) + + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, on=on_cols_multi) + .set_index(idx_cols_multi).sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [None, np.asarray, Series, Index]) + def test_merge_datetime_index(self, box): + # see gh-19038 + df = DataFrame([1, 2, 3], + ["2016-01-01", "2017-01-01", "2018-01-01"], + columns=["a"]) + df.index = pd.to_datetime(df.index) + on_vector = df.index.year + + if box is not None: + on_vector = box(on_vector) + + expected = DataFrame( + OrderedDict([ + ("a", [1, 2, 3]), + ("key_1", [2016, 2017, 2018]), + ]) + ) + + result = df.merge(df, on=["a", on_vector], how="inner") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + OrderedDict([ + ("key_0", [2016, 2017, 2018]), + ("a_x", [1, 2, 3]), + ("a_y", [1, 2, 3]), + ]) + ) + + result = df.merge(df, on=[df.index.year], how="inner") + tm.assert_frame_equal(result, expected) + + def test_single_common_level(self): + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index_left) + + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + result = left.join(right) + expected = (pd.merge(left.reset_index(), 
right.reset_index(), + on=['key'], how='inner') + .set_index(['key', 'X', 'Y'])) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index c7fba47a8f27c..07b00cef2669e 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1010,6 +1010,21 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' + def test_append_empty_frame_to_series_with_dateutil_tz(self): + # GH 23682 + date = Timestamp('2018-10-24 07:30:00', tz=dateutil.tz.tzutc()) + s = Series({'date': date, 'a': 1.0, 'b': 2.0}) + df = DataFrame(columns=['c', 'd']) + result = df.append(s, ignore_index=True) + expected = DataFrame([[np.nan, np.nan, 1., 2., date]], + columns=['c', 'd', 'a', 'b', 'date']) + # These columns get cast to object after append + object_cols = ['c', 'd', 'date'] + expected.loc[:, object_cols] = expected.loc[:, object_cols].astype( + object + ) + assert_frame_equal(result, expected) + class TestConcatenate(ConcatenateBase): diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 4cce26d135443..bcecedc2bba97 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -189,20 +189,7 @@ def test_scalar_na_logical_ops_corners(self): operator.and_, operator.or_, operator.xor, - pytest.param(ops.rand_, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation returns " - "Index", - raises=AssertionError, - strict=True)), - pytest.param(ops.ror_, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation raises", - raises=ValueError, strict=True)), - pytest.param(ops.rxor, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation raises", - raises=TypeError, strict=True)) + ]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 @@ -221,6 +208,19 @@ def test_logical_ops_with_index(self, 
op): result = op(ser, idx2) assert_series_equal(result, expected) + @pytest.mark.parametrize("op, expected", [ + (ops.rand_, pd.Index([False, True])), + (ops.ror_, pd.Index([False, True])), + (ops.rxor, pd.Index([])), + ]) + def test_reverse_ops_with_index(self, op, expected): + # https://github.com/pandas-dev/pandas/pull/23628 + # multi-set Index ops are buggy, so let's avoid duplicates... + ser = Series([True, False]) + idx = Index([False, True]) + result = op(ser, idx) + tm.assert_index_equal(result, expected) + def test_logical_ops_label_based(self): # GH#4947 # logical ops should be label based diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index bc644071e914f..0e45fd6411ac0 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2341,7 +2341,17 @@ def test_update_filtered(self): assert_panel_equal(pan, expected) - def test_update_raise(self): + @pytest.mark.parametrize('bad_kwarg, exception, msg', [ + # errors must be 'ignore' or 'raise' + ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), + ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') + ]) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + pan = Panel([[[1.5, np.nan, 3.]]]) + with pytest.raises(exception, match=msg): + pan.update(pan, **bad_kwarg) + + def test_update_raise_on_overlap(self): pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.]], @@ -2349,8 +2359,15 @@ def test_update_raise(self): [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) - pytest.raises(Exception, pan.update, *(pan, ), - **{'raise_conflict': True}) + with pytest.raises(ValueError, match='Data overlaps'): + pan.update(pan, errors='raise') + + @pytest.mark.parametrize('raise_conflict', [True, False]) + def test_update_deprecation(self, raise_conflict): + pan = Panel([[[1.5, np.nan, 3.]]]) + other = Panel([[[]]]) + with tm.assert_produces_warning(FutureWarning): + pan.update(other, 
raise_conflict=raise_conflict) def test_all_any(self): assert (self.panel.all(axis=0).values == nanall( diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index c1bdab73c2671..60b60603f0289 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -761,9 +761,8 @@ def test_bad_generic_functions(self, func): ('BadParameters', 'missing_params', ('Parameters {**kwargs} not documented',)), ('BadParameters', 'bad_colon_spacing', - ('Parameters {kind} not documented', - 'Unknown parameters {kind: str}', - 'Parameter "kind: str" has no type')), + ('Parameter "kind" requires a space before the colon ' + 'separating the parameter name and type',)), ('BadParameters', 'no_description_period', ('Parameter "kind" description should finish with "."',)), ('BadParameters', 'no_description_period_with_directive', diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 7da77a1f60ad5..873ba71d6539d 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -97,6 +97,8 @@ 'PR08': 'Parameter "{param_name}" description should start with a ' 'capital letter', 'PR09': 'Parameter "{param_name}" description should finish with "."', + 'PR10': 'Parameter "{param_name}" requires a space before the colon ' + 'separating the parameter name and type', 'RT01': 'No Returns section found', 'YD01': 'No Yields section found', 'SA01': 'See Also section not found', @@ -644,7 +646,11 @@ def validate_one(func_name): for param in doc.doc_parameters: if not param.startswith("*"): # Check can ignore var / kwargs if not doc.parameter_type(param): - errs.append(error('PR04', param_name=param)) + if ':' in param: + errs.append(error('PR10', + param_name=param.split(':')[0])) + else: + errs.append(error('PR04', param_name=param)) else: if doc.parameter_type(param)[-1] == '.': errs.append(error('PR05', param_name=param)) From eae7716b073d28fc243774869bc3222b5cf0cabb 
Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 15 Nov 2018 13:11:39 -0500 Subject: [PATCH 25/32] Merge branch 'master' of https://github.com/pandas-dev/pandas into dev_melt_column_check # Conflicts: # doc/source/whatsnew/v0.24.0.rst --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 28e6f1c2c3573..f93bf27259ca5 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -26,6 +26,7 @@ New features - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`pd.melt` now requires `id_vars` and `value_vars` to be in the ``DataFrame`` .. _whatsnew_0240.enhancements.extension_array_operators: From fba641fdb53beb3ec1b952cf9329544af1c378e5 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 15 Nov 2018 15:16:24 -0500 Subject: [PATCH 26/32] handle multiindex columns --- pandas/core/reshape/melt.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 60496ba64da5a..f612712332437 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -25,6 +25,10 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? 
+ if isinstance(frame.columns, ABCMultiIndex): + cols = [x for c in frame.columns for x in c] + else: + cols = list(frame.columns) if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] @@ -35,7 +39,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, else: # Check that `id_vars` are in frame id_vars = list(id_vars) - missing = Index(id_vars).difference(frame.columns) + missing = Index(np.ravel(id_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'id_vars' are not present" " in the DataFrame: {missing}" @@ -53,7 +57,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, else: value_vars = list(value_vars) # Check that `value_vars` are in frame - missing = Index(value_vars).difference(frame.columns) + missing = Index(np.ravel(value_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'value_vars' are not present in" " the DataFrame: {missing}" From 06b7cdbe058ca67854630e2cb38eb958e1f0454e Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 15 Nov 2018 17:09:03 -0500 Subject: [PATCH 27/32] test single var melt with multiindex --- pandas/tests/reshape/test_melt.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 67d3acfef737c..192ea5a0882de 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -101,6 +101,14 @@ def test_vars_work_with_multiindex(self): result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) tm.assert_frame_equal(result, expected) + def test_single_vars_work_with_multiindex(self): + expected = DataFrame( + {'A': {0: 1.067683, 1: -1.321405, 2: -0.807333}, + 'CAP': {0: 'B', 1: 'B', 2: 'B'}, + 'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}}) + result = self.df1.melt(['A'], ['B'], col_level=0) + tm.assert_frame_equal(result, expected) + def test_tuple_vars_fail_with_multiindex(self): # melt should fail with an 
informative error message if # the columns have a MultiIndex and a tuple is passed From 39c746be9c7ba618a609aa3aa12b347b952fa70b Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Thu, 15 Nov 2018 17:11:57 -0500 Subject: [PATCH 28/32] test single var melt with multiindex --- pandas/tests/reshape/test_melt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 192ea5a0882de..12406615ccfe0 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -102,8 +102,8 @@ def test_vars_work_with_multiindex(self): tm.assert_frame_equal(result, expected) def test_single_vars_work_with_multiindex(self): - expected = DataFrame( - {'A': {0: 1.067683, 1: -1.321405, 2: -0.807333}, + expected = DataFrame({ + 'A': {0: 1.067683, 1: -1.321405, 2: -0.807333}, 'CAP': {0: 'B', 1: 'B', 2: 'B'}, 'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}}) result = self.df1.melt(['A'], ['B'], col_level=0) From af170e11a338ab69e0ab0ccca55eafecd5e265ce Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Fri, 16 Nov 2018 10:35:21 -0500 Subject: [PATCH 29/32] pep8 and index sorting --- pandas/core/reshape/melt.py | 2 +- pandas/tests/reshape/test_melt.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index f612712332437..439f4513e1029 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -12,8 +12,8 @@ from pandas import compat from pandas.core.arrays import Categorical -from pandas.core.indexes.base import Index from pandas.core.frame import _shared_docs +from pandas.core.indexes.base import Index from pandas.core.reshape.concat import concat from pandas.core.tools.numeric import to_numeric diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 12406615ccfe0..c9c35c1086ee6 100644 --- a/pandas/tests/reshape/test_melt.py +++ 
b/pandas/tests/reshape/test_melt.py @@ -269,6 +269,7 @@ def test_melt_missing_columns_raises(self): Col="\\['not_here', 'or_there'\\]")): df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) + class TestLreshape(object): def test_pairs(self): @@ -340,6 +341,7 @@ def test_pairs(self): 'wt': ['wt%d' % i for i in range(1, 4)]} pytest.raises(ValueError, lreshape, df, spec) + class TestWideToLong(object): def test_simple(self): From 4c9bc9f81fb463631186412e8b656f0ed9e39ec1 Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 21 Nov 2018 11:00:57 -0500 Subject: [PATCH 30/32] rm extra description --- doc/source/whatsnew/v0.24.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index f93bf27259ca5..28e6f1c2c3573 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -26,7 +26,6 @@ New features - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) -- :func:`pd.melt` now requires `id_vars` and `value_vars` to be in the ``DataFrame`` .. _whatsnew_0240.enhancements.extension_array_operators: From c59d29f468da197f0d23bce03eac401faf8e6e0c Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 21 Nov 2018 11:01:09 -0500 Subject: [PATCH 31/32] add comment --- pandas/core/reshape/melt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 439f4513e1029..2dd6dc71b9d98 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -25,6 +25,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? 
+ # If multiindex, gather names of columns on all level for checking presence + # of `id_vars` and `value_vars` if isinstance(frame.columns, ABCMultiIndex): cols = [x for c in frame.columns for x in c] else: From 0db8838e26f74caec9fef7e604ab530437cd9fcc Mon Sep 17 00:00:00 2001 From: Michael Silverstein Date: Wed, 21 Nov 2018 11:01:18 -0500 Subject: [PATCH 32/32] add MI tests --- pandas/tests/reshape/test_melt.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index c9c35c1086ee6..8fd3ae8bb387b 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -269,6 +269,21 @@ def test_melt_missing_columns_raises(self): Col="\\['not_here', 'or_there'\\]")): df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) + # Multiindex melt fails if column is missing from multilevel melt + multi = df.copy() + multi.columns = [list('ABCD'), list('abcd')] + with pytest.raises( + KeyError, + match=msg.format(Var='id_vars', + Col="\\['E'\\]")): + multi.melt([('E', 'a')], [('B', 'b')]) + # Multiindex fails if column is missing from single level melt + with pytest.raises( + KeyError, + match=msg.format(Var='value_vars', + Col="\\['F'\\]")): + multi.melt(['A'], ['F'], col_level=0) + class TestLreshape(object):