From 4792d70261c59bd4cec8981c2886efadcd5f63dd Mon Sep 17 00:00:00 2001 From: Maitreyi Nagarkar Date: Sat, 10 Mar 2018 17:15:00 -0800 Subject: [PATCH 1/7] add documentation to rolling.corr --- pandas/core/window.py | 112 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index c41b07759d555..44fddb686ac81 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1028,19 +1028,111 @@ def _get_cov(X, Y): _get_cov, pairwise=bool(pairwise)) _shared_docs['corr'] = dedent(""" - %(name)s sample correlation + Calculate %(name)s correlation. + + This function uses Pearson's definition of correlation. Parameters ---------- - other : Series, DataFrame, or ndarray, optional - if not supplied then will default to self and produce pairwise output + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self. pairwise : bool, default None - If False then only matching columns between self and other will be - used and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndex DataFrame in the case of DataFrame inputs. - In the case of missing elements, only complete pairwise observations - will be used.""") + Calculate pairwise combinations of columns within a + DataFrame. If other is not specified, defaults to True, + otherwise defaults to False. Not relevant for Series. + See notes. + **kwargs + Under Review. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the + %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data + DataFrame.%(name)s : Calling object with DataFrames + Series.corr : Equivalent method for Series + DataFrame.corr : Equivalent method for DataFrame + %(name)s.cov : Similar method to calculate covariance + numpy.corrcoef : NumPy Pearson's correlation calculation + + Notes + ----- + Other should be always be specified, except for DataFrame inputs with + pairwise set to `True`. All other input combinations will return all 1's. + + Function will return `NaN`s for correlations of equal valued sequences; + this is the result of a 0/0 division error. + + When pairwise is set to `False`, only matching columns between self and + other will be used. + + When pairwise is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the "other" DataFrame + columns on the second level. + + In the case of missing elements, only complete pairwise observations + will be used. + + Examples + -------- + The below example shows a rolling calculation with a window size of + four matching the equivalent function call using `numpy.corrcoef`. + + >>> v1 = [3, 3, 3, 5, 8] + >>> v2 = [3, 4, 4, 4, 8] + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits + >>> import numpy as np + >>> # numpy returns a 2X2 array, the correlation coefficient + >>> # is the number at entry [0][1] + >>> print(fmt.format(np.corrcoef(v1[:-1], v2[:-1])[0][1])) + 0.333333 + >>> print(fmt.format(np.corrcoef(v1[1:], v2[1:])[0][1])) + 0.916949 + >>> s1 = pd.Series(v1) + >>> s2 = pd.Series(v2) + >>> s1.rolling(4).corr(s2) + 0 NaN + 1 NaN + 2 NaN + 3 0.333333 + 4 0.916949 + dtype: float64 + + The below example shows a similar rolling calculation on a + DataFrame using the pairwise option. + + >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.], [46., 31.], [50., 36.]]) + >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) + [[1. 0.6263001] + [0.6263001 1. ]] + >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) + [[1. 0.5553681] + [0.5553681 1. ]] + >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> df + X Y + 0 51.0 35.0 + 1 49.0 30.0 + 2 47.0 32.0 + 3 46.0 31.0 + 4 50.0 36.0 + >>> df.rolling(4).corr(pairwise=True) + X Y + 0 X NaN NaN + Y NaN NaN + 1 X NaN NaN + Y NaN NaN + 2 X NaN NaN + Y NaN NaN + 3 X 1.000000 0.626300 + Y 0.626300 1.000000 + 4 X 1.000000 0.555368 + Y 0.555368 1.000000 + """) def corr(self, other=None, pairwise=None, **kwargs): if other is None: @@ -1288,7 +1380,6 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): ddof=ddof, **kwargs) @Substitution(name='rolling') - @Appender(_doc_template) @Appender(_shared_docs['corr']) def corr(self, other=None, pairwise=None, **kwargs): return super(Rolling, self).corr(other=other, pairwise=pairwise, @@ -1527,7 +1618,6 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): ddof=ddof, **kwargs) @Substitution(name='expanding') - @Appender(_doc_template) @Appender(_shared_docs['corr']) def corr(self, other=None, pairwise=None, **kwargs): return super(Expanding, self).corr(other=other, pairwise=pairwise, From 98ed79ba2325c129b1d11319f2d3781c52f7e230 Mon Sep 17 00:00:00 2001 From: theandygross Date: Sat, 10 Mar 2018 17:33:47 -0800 Subject: [PATCH 2/7] clean-up pep8 --- pandas/core/window.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 44fddb686ac81..6ce1832e98b34 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1034,12 +1034,12 @@ def _get_cov(X, Y): Parameters ---------- - other : Series, DataFrame, or ndarray, optional + other : Series, DataFrame, or ndarray, optional If not supplied then will default to self. pairwise : bool, default None - Calculate pairwise combinations of columns within a + Calculate pairwise combinations of columns within a DataFrame. If other is not specified, defaults to True, - otherwise defaults to False. Not relevant for Series. + otherwise defaults to False. Not relevant for Series. See notes. **kwargs Under Review. @@ -1048,7 +1048,7 @@ def _get_cov(X, Y): ------- Series or DataFrame Returned object type is determined by the caller of the - %(name)s calculation. + %(name)s calculation. See Also -------- @@ -1065,7 +1065,7 @@ def _get_cov(X, Y): pairwise set to `True`. All other input combinations will return all 1's. Function will return `NaN`s for correlations of equal valued sequences; - this is the result of a 0/0 division error. + this is the result of a 0/0 division error. When pairwise is set to `False`, only matching columns between self and other will be used. @@ -1086,7 +1086,7 @@ def _get_cov(X, Y): >>> v2 = [3, 4, 4, 4, 8] >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits >>> import numpy as np - >>> # numpy returns a 2X2 array, the correlation coefficient + >>> # numpy returns a 2X2 array, the correlation coefficient >>> # is the number at entry [0][1] >>> print(fmt.format(np.corrcoef(v1[:-1], v2[:-1])[0][1])) 0.333333 @@ -1102,16 +1102,17 @@ def _get_cov(X, Y): 4 0.916949 dtype: float64 - The below example shows a similar rolling calculation on a + The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.], [46., 31.], [50., 36.]]) + >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ + [46., 31.], [50., 36.]]) >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) [[1. 0.6263001] [0.6263001 1. ]] >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) [[1. 0.5553681] - [0.5553681 1. ]] + [0.5553681 1. ]] >>> df = pd.DataFrame(matrix, columns=['X','Y']) >>> df X Y From ce7fdac5e0dca997c06f521a7657e4717e8e87eb Mon Sep 17 00:00:00 2001 From: theandygross Date: Sat, 10 Mar 2018 17:44:32 -0800 Subject: [PATCH 3/7] clean-up pep8 --- pandas/core/window.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 6ce1832e98b34..163d1967f0151 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1063,25 +1063,25 @@ def _get_cov(X, Y): ----- Other should be always be specified, except for DataFrame inputs with pairwise set to `True`. All other input combinations will return all 1's. - + Function will return `NaN`s for correlations of equal valued sequences; this is the result of a 0/0 division error. - - When pairwise is set to `False`, only matching columns between self and + + When pairwise is set to `False`, only matching columns between self and other will be used. - - When pairwise is set to `True`, the output will be a MultiIndex DataFrame - with the original index on the first level, and the "other" DataFrame + + When pairwise is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the "other" DataFrame columns on the second level. - + In the case of missing elements, only complete pairwise observations will be used. - + Examples -------- The below example shows a rolling calculation with a window size of four matching the equivalent function call using `numpy.corrcoef`. - + >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits @@ -1101,10 +1101,10 @@ def _get_cov(X, Y): 3 0.333333 4 0.916949 dtype: float64 - + The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - + >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ [46., 31.], [50., 36.]]) >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) @@ -1133,7 +1133,7 @@ def _get_cov(X, Y): Y 0.626300 1.000000 4 X 1.000000 0.555368 Y 0.555368 1.000000 - """) +""") def corr(self, other=None, pairwise=None, **kwargs): if other is None: From 248598b2be1087247bcab945fbdb6344f8e468b0 Mon Sep 17 00:00:00 2001 From: Andrew Gross Date: Sun, 15 Apr 2018 10:59:31 -0700 Subject: [PATCH 4/7] add link to Pearson's correlation calc. --- pandas/core/window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 163d1967f0151..4f868a8b8fdfa 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1030,7 +1030,7 @@ def _get_cov(X, Y): _shared_docs['corr'] = dedent(""" Calculate %(name)s correlation. - This function uses Pearson's definition of correlation. + This function uses Pearson's definition of correlation (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). Parameters ---------- From 87cbb3f1ce1934b002f92957f0a9794da1b3f219 Mon Sep 17 00:00:00 2001 From: Andrew Gross Date: Sun, 15 Apr 2018 11:22:19 -0700 Subject: [PATCH 5/7] Add changes requested in review. --- pandas/core/window.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 4f868a8b8fdfa..dcffaba8a3382 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1030,7 +1030,8 @@ def _get_cov(X, Y): _shared_docs['corr'] = dedent(""" Calculate %(name)s correlation. - This function uses Pearson's definition of correlation (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). Parameters ---------- @@ -1038,8 +1039,8 @@ def _get_cov(X, Y): If not supplied then will default to self. pairwise : bool, default None Calculate pairwise combinations of columns within a - DataFrame. If other is not specified, defaults to True, - otherwise defaults to False. Not relevant for Series. + DataFrame. If `other` is not specified, defaults to `True`, + otherwise defaults to `False`. Not relevant for :class:`~pandas.Series`. See notes. **kwargs Under Review. @@ -1061,17 +1062,18 @@ def _get_cov(X, Y): Notes ----- - Other should be always be specified, except for DataFrame inputs with - pairwise set to `True`. All other input combinations will return all 1's. + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + set to `True`. Function will return `NaN`s for correlations of equal valued sequences; this is the result of a 0/0 division error. - When pairwise is set to `False`, only matching columns between self and - other will be used. + When `pairwise` is set to `False`, only matching columns between `self` and + `other` will be used. - When pairwise is set to `True`, the output will be a MultiIndex DataFrame - with the original index on the first level, and the "other" DataFrame + When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the `other` DataFrame columns on the second level. In the case of missing elements, only complete pairwise observations @@ -1085,7 +1087,6 @@ def _get_cov(X, Y): >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits - >>> import numpy as np >>> # numpy returns a 2X2 array, the correlation coefficient >>> # is the number at entry [0][1] >>> print(fmt.format(np.corrcoef(v1[:-1], v2[:-1])[0][1])) From fc2ee994526ffabcdc65504a2a98f33d01981f4e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 8 Jul 2018 09:29:41 -0500 Subject: [PATCH 6/7] Minor fixup --- pandas/core/window.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index e686e3d473f9f..599c8d38df31a 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1393,9 +1393,6 @@ def _get_cov(X, Y): _shared_docs['corr'] = dedent(""" Calculate %(name)s correlation. - This function uses Pearson's definition of correlation - (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). - Parameters ---------- other : Series, DataFrame, or ndarray, optional @@ -1425,6 +1422,9 @@ def _get_cov(X, Y): Notes ----- + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + When `other` is not specified, the output will be self correlation (e.g. all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` set to `True`. From 4c322b04bf6e5f165cdcbb3c64d8240b46e1d63e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 8 Jul 2018 09:31:52 -0500 Subject: [PATCH 7/7] LINT fixup --- pandas/core/window.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 599c8d38df31a..1029b96c58475 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1400,8 +1400,8 @@ def _get_cov(X, Y): pairwise : bool, default None Calculate pairwise combinations of columns within a DataFrame. If `other` is not specified, defaults to `True`, - otherwise defaults to `False`. Not relevant for :class:`~pandas.Series`. - See notes. + otherwise defaults to `False`. + Not relevant for :class:`~pandas.Series`. **kwargs Under Review. @@ -1425,8 +1425,8 @@ def _get_cov(X, Y): This function uses Pearson's definition of correlation (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). - When `other` is not specified, the output will be self correlation (e.g. - all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` set to `True`. Function will return `NaN`s for correlations of equal valued sequences;