From 67c100e006604be77ef3fd7bec32433cf3ffdbca Mon Sep 17 00:00:00 2001 From: Stephen Childs Date: Sat, 10 Mar 2018 10:49:48 -0500 Subject: [PATCH 1/2] DOC: Revise docstring of DataFrame cov method Update the docstring with some examples from elsewhere in the pandas documentation. Some of the examples use randomly generated time series because we need to get covariance between long series. Used a random seed to ensure that the results are the same each time. --- pandas/core/frame.py | 82 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a66d00fff9714..3ec9ed74e4168 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5501,7 +5501,22 @@ def corr(self, method='pearson', min_periods=1): def cov(self, min_periods=None): """ - Compute pairwise covariance of columns, excluding NA/null values + Compute pairwise covariance of columns, excluding NA/null values. + + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns + of the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. Parameters ---------- @@ -5511,12 +5526,71 @@ def cov(self, min_periods=None): Returns ------- - y : DataFrame + DataFrame + The covariance matrix of the series of the DataFrame. 
+ + See Also + -------- + Series.cov : compute covariance with another Series + core.window.EWM.cov: expoential weighted sample covariance + core.window.Expanding.cov : expanding sample covariance + core.window.Rolling.cov : rolling sample covariance Notes ----- - `y` contains the covariance matrix of the DataFrame's time series. - The covariance is normalized by N-1 (unbiased estimator). + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-1. + + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + + However, for many applications this estimate may not be acceptable + because the covariance matrix estimate is not guaranteed to be positive + semi-definite. This could lead to correlation estimates having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_ + matrices>`__ for more details. + + Examples + -------- + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) + >>> df.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of observations for each + column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... 
columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns From 05e2753960a1d7fe977d33d53fd629b57237140e Mon Sep 17 00:00:00 2001 From: Stephen Childs Date: Sun, 11 Mar 2018 07:28:32 -0400 Subject: [PATCH 2/2] DOC: Fix See Also and min_periods explanation. Responding to comments on PR. See Also section will link properly and the number of periods explanation is clearer. --- pandas/core/frame.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3ec9ed74e4168..d8269b605eef9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5531,10 +5531,10 @@ def cov(self, min_periods=None): See Also -------- - Series.cov : compute covariance with another Series - core.window.EWM.cov: expoential weighted sample covariance - core.window.Expanding.cov : expanding sample covariance - core.window.Rolling.cov : rolling sample covariance + pandas.Series.cov : compute covariance with another Series + pandas.core.window.EWM.cov : exponential weighted sample covariance + pandas.core.window.Expanding.cov : expanding sample covariance + pandas.core.window.Rolling.cov : rolling sample covariance Notes ----- @@ -5578,8 +5578,8 @@ def cov(self, min_periods=None): **Minimum number of periods** This method also supports an optional ``min_periods`` keyword - that specifies the required minimum number of observations for each - column pair in order to have a valid result: + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: >>> np.random.seed(42) >>> df = pd.DataFrame(np.random.randn(20, 3),