From d463a02246eee759700ec4c0e7b83af0efb5c0c4 Mon Sep 17 00:00:00 2001 From: ChristofKaufmann Date: Fri, 3 Dec 2021 12:03:44 +0100 Subject: [PATCH 1/3] DOC: Improve code example for DataFrame.join The modified code example does not have unique values in the 'key' column in the calling DataFrame `df`. With unique keys the special behavior of `join` has not been highlighted, since it just added a new column. With the modified values the example shows that the 'key' column is really used as key. --- pandas/core/frame.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9e6faa7037dae..71713acbee123 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9124,17 +9124,17 @@ def join( Examples -------- - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) >>> df key A 0 K0 A0 1 K1 A1 - 2 K2 A2 + 2 K1 A2 3 K3 A3 - 4 K4 A4 - 5 K5 A5 + 4 K0 A4 + 5 K1 A5 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], ... 'B': ['B0', 'B1', 'B2']}) @@ -9151,10 +9151,10 @@ def join( key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 - 2 K2 A2 K2 B2 + 2 K1 A2 K2 B2 3 K3 A3 NaN NaN - 4 K4 A4 NaN NaN - 5 K5 A5 NaN NaN + 4 K0 A4 NaN NaN + 5 K1 A5 NaN NaN If we want to join using the key columns, we need to set key to be the index in both `df` and `other`. The joined DataFrame will have @@ -9164,11 +9164,11 @@ def join( A B key K0 A0 B0 + K0 A4 B0 K1 A1 B1 - K2 A2 B2 + K1 A2 B1 + K1 A5 B1 K3 A3 NaN - K4 A4 NaN - K5 A5 NaN Another option to join using the key columns is to use the `on` parameter. DataFrame.join always uses `other`'s index but we can use @@ -9179,10 +9179,10 @@ def join( key A B 0 K0 A0 B0 1 K1 A1 B1 - 2 K2 A2 B2 + 2 K1 A2 B1 3 K3 A3 NaN - 4 K4 A4 NaN - 5 K5 A5 NaN + 4 K0 A4 B0 + 5 K1 A5 B1 """ return self._join_compat( other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort From 932e8f5016aaf0985ff942d9aff9107d27f85608 Mon Sep 17 00:00:00 2001 From: ChristofKaufmann Date: Sat, 4 Dec 2021 01:55:35 +0100 Subject: [PATCH 2/3] Use the modified values as additional example This adds back the original example and adds the new example as an additional one. --- pandas/core/frame.py | 59 ++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 71713acbee123..0c33daf23707a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9124,57 +9124,80 @@ def join( Examples -------- - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) - + >>> df key A 0 K0 A0 1 K1 A1 - 2 K1 A2 + 2 K2 A2 3 K3 A3 - 4 K0 A4 - 5 K1 A5 - + 4 K4 A4 + 5 K5 A5 + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], ... 'B': ['B0', 'B1', 'B2']}) - + >>> other key B 0 K0 B0 1 K1 B1 2 K2 B2 - + Join DataFrames using their indexes. - + >>> df.join(other, lsuffix='_caller', rsuffix='_other') key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 - 2 K1 A2 K2 B2 + 2 K2 A2 K2 B2 3 K3 A3 NaN NaN - 4 K0 A4 NaN NaN - 5 K1 A5 NaN NaN - + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN + If we want to join using the key columns, we need to set key to be the index in both `df` and `other`. The joined DataFrame will have key as its index. - + >>> df.set_index('key').join(other.set_index('key')) A B key K0 A0 B0 - K0 A4 B0 K1 A1 B1 - K1 A2 B1 - K1 A5 B1 + K2 A2 B2 K3 A3 NaN - + K4 A4 NaN + K5 A5 NaN + Another option to join using the key columns is to use the `on` parameter. DataFrame.join always uses `other`'s index but we can use any column in `df`. This method preserves the original DataFrame's index in the result. + + >>> df.join(other.set_index('key'), on='key') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN + Using non-unique key values shows how they are matched. + + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K1 A2 + 3 K3 A3 + 4 K0 A4 + 5 K1 A5 + >>> df.join(other.set_index('key'), on='key') key A B 0 K0 A0 B0 From c40424ba76fe461d8b5fc7d16e22a8c8942f4712 Mon Sep 17 00:00:00 2001 From: ChristofKaufmann Date: Sat, 4 Dec 2021 01:58:03 +0100 Subject: [PATCH 3/3] Remove whitespace on blank lines --- pandas/core/frame.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0c33daf23707a..a2b53a04302d6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9126,7 +9126,7 @@ def join( -------- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) - + >>> df key A 0 K0 A0 @@ -9135,18 +9135,18 @@ def join( 3 K3 A3 4 K4 A4 5 K5 A5 - + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], ... 'B': ['B0', 'B1', 'B2']}) - + >>> other key B 0 K0 B0 1 K1 B1 2 K2 B2 - + Join DataFrames using their indexes. - + >>> df.join(other, lsuffix='_caller', rsuffix='_other') key_caller A key_other B 0 K0 A0 K0 B0 @@ -9155,11 +9155,11 @@ def join( 3 K3 A3 NaN NaN 4 K4 A4 NaN NaN 5 K5 A5 NaN NaN - + If we want to join using the key columns, we need to set key to be the index in both `df` and `other`. The joined DataFrame will have key as its index. - + >>> df.set_index('key').join(other.set_index('key')) A B key @@ -9169,12 +9169,12 @@ def join( K3 A3 NaN K4 A4 NaN K5 A5 NaN - + Another option to join using the key columns is to use the `on` parameter. DataFrame.join always uses `other`'s index but we can use any column in `df`. This method preserves the original DataFrame's index in the result. - + >>> df.join(other.set_index('key'), on='key') key A B 0 K0 A0 B0 @@ -9185,10 +9185,10 @@ def join( 5 K5 A5 NaN Using non-unique key values shows how they are matched. - + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) - + >>> df key A 0 K0 A0 @@ -9197,7 +9197,7 @@ def join( 3 K3 A3 4 K0 A4 5 K1 A5 - + >>> df.join(other.set_index('key'), on='key') key A B 0 K0 A0 B0