@@ -1209,20 +1209,68 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
 
     def to_records(self, index=True, convert_datetime64=True):
         """
-        Convert DataFrame to record array. Index will be put in the
-        'index' field of the record array if requested
+        Convert DataFrame to a NumPy record array.
+
+        Index will be put in the 'index' field of the record array if
+        requested.
 
         Parameters
         ----------
         index : boolean, default True
-            Include index in resulting record array, stored in 'index' field
+            Include index in resulting record array, stored in 'index' field.
         convert_datetime64 : boolean, default True
             Whether to convert the index to datetime.datetime if it is a
-            DatetimeIndex
+            DatetimeIndex.
 
         Returns
         -------
-        y : recarray
+        y : numpy.recarray
+
+        See Also
+        --------
+        DataFrame.from_records: convert structured or record ndarray
+            to DataFrame.
+        numpy.recarray: ndarray that allows field access using
+            attributes, analogous to typed columns in a
+            spreadsheet.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+        ...                   index=['a', 'b'])
+        >>> df
+           A     B
+        a  1  0.50
+        b  2  0.75
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The index can be excluded from the record array:
+
+        >>> df.to_records(index=False)
+        rec.array([(1, 0.5 ), (2, 0.75)],
+                  dtype=[('A', '<i8'), ('B', '<f8')])
+
+        By default, timestamps are converted to `datetime.datetime`:
+
+        >>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min')
+        >>> df
+                             A     B
+        2018-01-01 09:00:00  1  0.50
+        2018-01-01 09:01:00  2  0.75
+        >>> df.to_records()
+        rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ),
+                   (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The timestamp conversion can be disabled so NumPy's datetime64
+        data type is used instead:
+
+        >>> df.to_records(convert_datetime64=False)
+        rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
+                   ('2018-01-01T09:01:00.000000000', 2, 0.75)],
+                  dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
         """
         if index:
             if is_datetime64_any_dtype(self.index) and convert_datetime64:
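
A quick way to sanity-check the new `to_records` examples and the `from_records` cross-reference is a roundtrip through both methods. This sketch is not part of the patch and assumes default int64/float64 dtypes, which can vary by platform:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, index=['a', 'b'])

    # The index becomes the 'index' field of the record array.
    rec = df.to_records()
    print(rec.dtype.names)          # ('index', 'A', 'B')

    # from_records (referenced in See Also) rebuilds an equivalent frame
    # when told which field holds the index.
    roundtrip = pd.DataFrame.from_records(rec, index='index')
    print(roundtrip.equals(df))     # expected: True
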
@@ -4722,20 +4770,90 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
 
     def diff(self, periods=1, axis=0):
         """
-        1st discrete difference of object
+        First discrete difference of element.
+
+        Calculates the difference of a DataFrame element compared with another
+        element in the DataFrame (default is the element in the same column
+        of the previous row).
 
         Parameters
         ----------
         periods : int, default 1
-            Periods to shift for forming difference
+            Periods to shift for calculating difference, accepts negative
+            values.
         axis : {0 or 'index', 1 or 'columns'}, default 0
             Take difference over rows (0) or columns (1).
 
-            .. versionadded:: 0.16.1
+            .. versionadded:: 0.16.1.
 
         Returns
         -------
         diffed : DataFrame
+
+        See Also
+        --------
+        Series.diff: First discrete difference for a Series.
+        DataFrame.pct_change: Percent change over given number of periods.
+        DataFrame.shift: Shift index by desired number of periods with an
+            optional time freq.
+
+        Examples
+        --------
+        Difference with previous row
+
+        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
+        ...                    'b': [1, 1, 2, 3, 5, 8],
+        ...                    'c': [1, 4, 9, 16, 25, 36]})
+        >>> df
+           a  b   c
+        0  1  1   1
+        1  2  1   4
+        2  3  2   9
+        3  4  3  16
+        4  5  5  25
+        5  6  8  36
+
+        >>> df.diff()
+             a    b     c
+        0  NaN  NaN   NaN
+        1  1.0  0.0   3.0
+        2  1.0  1.0   5.0
+        3  1.0  1.0   7.0
+        4  1.0  2.0   9.0
+        5  1.0  3.0  11.0
+
+        Difference with previous column
+
+        >>> df.diff(axis=1)
+            a    b     c
+        0 NaN  0.0   0.0
+        1 NaN -1.0   3.0
+        2 NaN -1.0   7.0
+        3 NaN -1.0  13.0
+        4 NaN  0.0  20.0
+        5 NaN  2.0  28.0
+
+        Difference with 3rd previous row
+
+        >>> df.diff(periods=3)
+             a    b     c
+        0  NaN  NaN   NaN
+        1  NaN  NaN   NaN
+        2  NaN  NaN   NaN
+        3  3.0  2.0  15.0
+        4  3.0  4.0  21.0
+        5  3.0  6.0  27.0
+
+        Difference with following row
+
+        >>> df.diff(periods=-1)
+             a    b     c
+        0 -1.0  0.0  -3.0
+        1 -1.0 -1.0  -5.0
+        2 -1.0 -1.0  -7.0
+        3 -1.0 -2.0  -9.0
+        4 -1.0 -3.0 -11.0
+        5  NaN  NaN   NaN
         """
         bm_axis = self._get_block_manager_axis(axis)
         new_data = self._data.diff(n=periods, axis=bm_axis)
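
As a cross-check of the `DataFrame.shift` relationship mentioned in the new See Also section, `diff(periods=n)` should agree with subtracting a shifted copy of the frame. A small sketch, not part of the patch:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
                       'b': [1, 1, 2, 3, 5, 8],
                       'c': [1, 4, 9, 16, 25, 36]})

    # diff(periods=n) is expected to match df - df.shift(n) element-wise,
    # with NaN where the shifted values fall outside the frame.
    print(df.diff(periods=3).equals(df - df.shift(3)))        # expected: True

    # axis=1 takes differences across columns instead of rows.
    print(df.diff(axis=1).equals(df - df.shift(1, axis=1)))   # expected: True
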
@@ -5501,7 +5619,22 @@ def corr(self, method='pearson', min_periods=1):
 
     def cov(self, min_periods=None):
         """
-        Compute pairwise covariance of columns, excluding NA/null values
+        Compute pairwise covariance of columns, excluding NA/null values.
+
+        Compute the pairwise covariance among the series of a DataFrame.
+        The returned data frame is the `covariance matrix
+        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
+        of the DataFrame.
+
+        Both NA and null values are automatically excluded from the
+        calculation. (See the note below about bias from missing values.)
+        A threshold can be set for the minimum number of
+        observations for each value created. Comparisons with observations
+        below this threshold will be returned as ``NaN``.
+
+        This method is generally used for the analysis of time series data to
+        understand the relationship between different measures
+        across time.
 
         Parameters
         ----------
@@ -5511,12 +5644,71 @@ def cov(self, min_periods=None):
 
         Returns
         -------
-        y : DataFrame
+        DataFrame
+            The covariance matrix of the series of the DataFrame.
+
+        See Also
+        --------
+        pandas.Series.cov : compute covariance with another Series
+        pandas.core.window.EWM.cov : exponential weighted sample covariance
+        pandas.core.window.Expanding.cov : expanding sample covariance
+        pandas.core.window.Rolling.cov : rolling sample covariance
 
         Notes
         -----
-        `y` contains the covariance matrix of the DataFrame's time series.
-        The covariance is normalized by N-1 (unbiased estimator).
+        Returns the covariance matrix of the DataFrame's time series.
+        The covariance is normalized by N-1.
+
+        For DataFrames that have Series that are missing data (assuming that
+        data is `missing at random
+        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
+        the returned covariance matrix will be an unbiased estimate
+        of the variance and covariance between the member Series.
+
+        However, for many applications this estimate may not be acceptable
+        because the estimated covariance matrix is not guaranteed to be positive
+        semi-definite. This could lead to estimated correlations having
+        absolute values which are greater than one, and/or a non-invertible
+        covariance matrix. See `Estimation of covariance matrices
+        <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
+        matrices>`__ for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
+        ...                   columns=['dogs', 'cats'])
+        >>> df.cov()
+                  dogs      cats
+        dogs  0.666667 -1.000000
+        cats -1.000000  1.666667
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(np.random.randn(1000, 5),
+        ...                   columns=['a', 'b', 'c', 'd', 'e'])
+        >>> df.cov()
+                  a         b         c         d         e
+        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
+        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
+        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
+        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
+        e  0.014144  0.009826 -0.000271 -0.013692  0.977795
+
+        **Minimum number of periods**
+
+        This method also supports an optional ``min_periods`` keyword
+        that specifies the required minimum number of non-NA observations for
+        each column pair in order to have a valid result:
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(np.random.randn(20, 3),
+        ...                   columns=['a', 'b', 'c'])
+        >>> df.loc[df.index[:5], 'a'] = np.nan
+        >>> df.loc[df.index[5:10], 'b'] = np.nan
+        >>> df.cov(min_periods=12)
+                  a         b         c
+        a  0.316741       NaN -0.150812
+        b       NaN  1.248003  0.191417
+        c -0.150812  0.191417  0.895202
         """
         numeric_df = self._get_numeric_data()
         cols = numeric_df.columns
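
To verify the "normalized by N-1" note, `DataFrame.cov` on the docstring's dogs/cats frame should match NumPy's sample covariance with `ddof=1`. A sketch, not part of the patch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], columns=['dogs', 'cats'])

    # np.cov expects variables in rows, hence the transpose; ddof=1 gives the
    # same N-1 normalization the docstring describes.
    print(np.allclose(df.cov().values, np.cov(df.values.T, ddof=1)))  # expected: True
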