
Commit 8dca0eb

Merge branch 'master' into docstring_hexbin
2 parents 53bf39d + 4271757

22 files changed, +973 -245 lines changed

doc/source/contributing.rst

Lines changed: 3 additions & 2 deletions
@@ -262,8 +262,9 @@ after updating.
 Contributing to the documentation
 =================================

-If you're not the developer type, contributing to the documentation is still of
-huge value. You don't even have to be an expert on *pandas* to do so! In fact,
+Contributing to the documentation benefits everyone who uses *pandas*.
+We encourage you to help us improve the documentation, and
+you don't have to be an expert on *pandas* to do so! In fact,
 there are sections of the docs that are worse off after being written by
 experts. If something in the docs doesn't make sense to you, updating the
 relevant section after you figure it out is a great way to ensure it will help

doc/source/whatsnew/v0.23.0.txt

Lines changed: 1 addition & 0 deletions
@@ -935,6 +935,7 @@ Indexing
 - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`)
 - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`)
 - Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`)
+- Bug in ``Index`` subclasses constructors that ignore unexpected keyword arguments (:issue:`19348`)


 MultiIndex
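
As a quick, hedged illustration of the ``Index`` entry above (not taken from the commit; the exact exception type is assumed here to be ``TypeError``), an unexpected keyword argument should now be rejected rather than silently ignored:

import pandas as pd

# A recognised keyword is accepted as before.
idx = pd.Index([1, 2, 3], name='ids')

# An unexpected keyword should no longer be silently dropped;
# the fixed constructors are expected to raise (TypeError assumed).
try:
    pd.Index([1, 2, 3], foo='bar')  # 'foo' is not a valid keyword
except TypeError as err:
    print('rejected unexpected keyword:', err)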

pandas/core/base.py

Lines changed: 0 additions & 18 deletions
@@ -1184,24 +1184,6 @@ def searchsorted(self, value, side='left', sorter=None):
         # needs coercion on the key (DatetimeIndex does already)
         return self.values.searchsorted(value, side=side, sorter=sorter)

-    _shared_docs['drop_duplicates'] = (
-        """Return %(klass)s with duplicate values removed
-
-        Parameters
-        ----------
-
-        keep : {'first', 'last', False}, default 'first'
-            - ``first`` : Drop duplicates except for the first occurrence.
-            - ``last`` : Drop duplicates except for the last occurrence.
-            - False : Drop all duplicates.
-        %(inplace)s
-
-        Returns
-        -------
-        deduplicated : %(klass)s
-        """)
-
-    @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
     def drop_duplicates(self, keep='first', inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         if isinstance(self, ABCIndexClass):
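
The removed block is the shared ``drop_duplicates`` docstring; the method itself stays. As a small, hedged reminder (not part of the commit) of what the documented ``keep`` options do:

import pandas as pd

s = pd.Series([1, 2, 2, 3, 3, 3])

# keep='first' (the default): keep the first occurrence of each value.
print(s.drop_duplicates())             # values 1, 2, 3 at positions 0, 1, 3

# keep='last': keep the last occurrence of each value.
print(s.drop_duplicates(keep='last'))  # values 1, 2, 3 at positions 0, 2, 5

# keep=False: drop every value that occurs more than once.
print(s.drop_duplicates(keep=False))   # only the value 1 remains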

pandas/core/frame.py

Lines changed: 204 additions & 12 deletions
@@ -1209,20 +1209,68 @@ def from_records(cls, data, index=None, exclude=None, columns=None,

     def to_records(self, index=True, convert_datetime64=True):
         """
-        Convert DataFrame to record array. Index will be put in the
-        'index' field of the record array if requested
+        Convert DataFrame to a NumPy record array.
+
+        Index will be put in the 'index' field of the record array if
+        requested.

         Parameters
         ----------
         index : boolean, default True
-            Include index in resulting record array, stored in 'index' field
+            Include index in resulting record array, stored in 'index' field.
         convert_datetime64 : boolean, default True
             Whether to convert the index to datetime.datetime if it is a
-            DatetimeIndex
+            DatetimeIndex.

         Returns
         -------
-        y : recarray
+        y : numpy.recarray
+
+        See Also
+        --------
+        DataFrame.from_records: convert structured or record ndarray
+            to DataFrame.
+        numpy.recarray: ndarray that allows field access using
+            attributes, analogous to typed columns in a
+            spreadsheet.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+        ...                   index=['a', 'b'])
+        >>> df
+           A     B
+        a  1  0.50
+        b  2  0.75
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The index can be excluded from the record array:
+
+        >>> df.to_records(index=False)
+        rec.array([(1, 0.5 ), (2, 0.75)],
+                  dtype=[('A', '<i8'), ('B', '<f8')])
+
+        By default, timestamps are converted to `datetime.datetime`:
+
+        >>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min')
+        >>> df
+                             A     B
+        2018-01-01 09:00:00  1  0.50
+        2018-01-01 09:01:00  2  0.75
+        >>> df.to_records()
+        rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ),
+                   (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The timestamp conversion can be disabled so NumPy's datetime64
+        data type is used instead:
+
+        >>> df.to_records(convert_datetime64=False)
+        rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
+                   ('2018-01-01T09:01:00.000000000', 2, 0.75)],
+                  dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
         """
         if index:
             if is_datetime64_any_dtype(self.index) and convert_datetime64:
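
The new See Also entry points to ``DataFrame.from_records``; a hedged sketch (not part of the commit) of the round trip between the two methods, assuming the index is stored under the default ``'index'`` field name:

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, index=['a', 'b'])

# DataFrame -> NumPy record array; the index goes into the 'index' field.
rec = df.to_records()

# Record array -> DataFrame, restoring the original index from that field.
roundtrip = pd.DataFrame.from_records(rec, index='index')

assert list(roundtrip.index) == ['a', 'b']
assert roundtrip['B'].tolist() == [0.5, 0.75]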
@@ -4722,20 +4770,90 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,

     def diff(self, periods=1, axis=0):
         """
-        1st discrete difference of object
+        First discrete difference of element.
+
+        Calculates the difference of a DataFrame element compared with another
+        element in the DataFrame (default is the element in the same column
+        of the previous row).

         Parameters
         ----------
         periods : int, default 1
-            Periods to shift for forming difference
+            Periods to shift for calculating difference, accepts negative
+            values.
         axis : {0 or 'index', 1 or 'columns'}, default 0
             Take difference over rows (0) or columns (1).

-            .. versionadded:: 0.16.1
+            .. versionadded:: 0.16.1.

         Returns
         -------
         diffed : DataFrame
+
+        See Also
+        --------
+        Series.diff: First discrete difference for a Series.
+        DataFrame.pct_change: Percent change over given number of periods.
+        DataFrame.shift: Shift index by desired number of periods with an
+            optional time freq.
+
+        Examples
+        --------
+        Difference with previous row
+
+        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
+        ...                    'b': [1, 1, 2, 3, 5, 8],
+        ...                    'c': [1, 4, 9, 16, 25, 36]})
+        >>> df
+           a  b   c
+        0  1  1   1
+        1  2  1   4
+        2  3  2   9
+        3  4  3  16
+        4  5  5  25
+        5  6  8  36
+
+        >>> df.diff()
+             a    b     c
+        0  NaN  NaN   NaN
+        1  1.0  0.0   3.0
+        2  1.0  1.0   5.0
+        3  1.0  1.0   7.0
+        4  1.0  2.0   9.0
+        5  1.0  3.0  11.0
+
+        Difference with previous column
+
+        >>> df.diff(axis=1)
+            a    b     c
+        0 NaN  0.0   0.0
+        1 NaN -1.0   3.0
+        2 NaN -1.0   7.0
+        3 NaN -1.0  13.0
+        4 NaN  0.0  20.0
+        5 NaN  2.0  28.0
+
+        Difference with 3rd previous row
+
+        >>> df.diff(periods=3)
+             a    b     c
+        0  NaN  NaN   NaN
+        1  NaN  NaN   NaN
+        2  NaN  NaN   NaN
+        3  3.0  2.0  15.0
+        4  3.0  4.0  21.0
+        5  3.0  6.0  27.0
+
+        Difference with following row
+
+        >>> df.diff(periods=-1)
+             a    b     c
+        0 -1.0  0.0  -3.0
+        1 -1.0 -1.0  -5.0
+        2 -1.0 -1.0  -7.0
+        3 -1.0 -2.0  -9.0
+        4 -1.0 -3.0 -11.0
+        5  NaN  NaN   NaN
         """
         bm_axis = self._get_block_manager_axis(axis)
         new_data = self._data.diff(n=periods, axis=bm_axis)
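
As a hedged aside (not part of the commit), the behaviour shown in the new examples can also be checked against ``DataFrame.shift``, since for numeric data ``diff(periods=n)`` matches subtracting the shifted frame:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
                   'b': [1, 1, 2, 3, 5, 8],
                   'c': [1, 4, 9, 16, 25, 36]})

# Row-wise and column-wise differences agree with explicit shifts.
assert df.diff().equals(df - df.shift())
assert df.diff(periods=3).equals(df - df.shift(3))
assert df.diff(axis=1).equals(df - df.shift(1, axis=1))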
@@ -5501,7 +5619,22 @@ def corr(self, method='pearson', min_periods=1):

     def cov(self, min_periods=None):
         """
-        Compute pairwise covariance of columns, excluding NA/null values
+        Compute pairwise covariance of columns, excluding NA/null values.
+
+        Compute the pairwise covariance among the series of a DataFrame.
+        The returned data frame is the `covariance matrix
+        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
+        of the DataFrame.
+
+        Both NA and null values are automatically excluded from the
+        calculation. (See the note below about bias from missing values.)
+        A threshold can be set for the minimum number of
+        observations for each value created. Comparisons with observations
+        below this threshold will be returned as ``NaN``.
+
+        This method is generally used for the analysis of time series data to
+        understand the relationship between different measures
+        across time.

         Parameters
         ----------
@@ -5511,12 +5644,71 @@ def cov(self, min_periods=None):

         Returns
         -------
-        y : DataFrame
+        DataFrame
+            The covariance matrix of the series of the DataFrame.
+
+        See Also
+        --------
+        pandas.Series.cov : compute covariance with another Series
+        pandas.core.window.EWM.cov: expoential weighted sample covariance
+        pandas.core.window.Expanding.cov : expanding sample covariance
+        pandas.core.window.Rolling.cov : rolling sample covariance

         Notes
         -----
-        `y` contains the covariance matrix of the DataFrame's time series.
-        The covariance is normalized by N-1 (unbiased estimator).
+        Returns the covariance matrix of the DataFrame's time series.
+        The covariance is normalized by N-1.
+
+        For DataFrames that have Series that are missing data (assuming that
+        data is `missing at random
+        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
+        the returned covariance matrix will be an unbiased estimate
+        of the variance and covariance between the member Series.
+
+        However, for many applications this estimate may not be acceptable
+        because the estimate covariance matrix is not guaranteed to be positive
+        semi-definite. This could lead to estimate correlations having
+        absolute values which are greater than one, and/or a non-invertible
+        covariance matrix. See `Estimation of covariance matrices
+        <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
+        matrices>`__ for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
+        ...                   columns=['dogs', 'cats'])
+        >>> df.cov()
+                  dogs      cats
+        dogs  0.666667 -1.000000
+        cats -1.000000  1.666667
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(np.random.randn(1000, 5),
+        ...                   columns=['a', 'b', 'c', 'd', 'e'])
+        >>> df.cov()
+                  a         b         c         d         e
+        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
+        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
+        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
+        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
+        e  0.014144  0.009826 -0.000271 -0.013692  0.977795
+
+        **Minimum number of periods**
+
+        This method also supports an optional ``min_periods`` keyword
+        that specifies the required minimum number of non-NA observations for
+        each column pair in order to have a valid result:
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(np.random.randn(20, 3),
+        ...                   columns=['a', 'b', 'c'])
+        >>> df.loc[df.index[:5], 'a'] = np.nan
+        >>> df.loc[df.index[5:10], 'b'] = np.nan
+        >>> df.cov(min_periods=12)
+                  a         b         c
+        a  0.316741       NaN -0.150812
+        b       NaN  1.248003  0.191417
+        c -0.150812  0.191417  0.895202
         """
         numeric_df = self._get_numeric_data()
         cols = numeric_df.columns
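
The Notes above state that the covariance is normalized by N-1; a small, hedged check (not part of the commit) that ``DataFrame.cov`` agrees with the manual unbiased formula:

import numpy as np
import pandas as pd

df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], columns=['dogs', 'cats'])

# Manual unbiased sample covariance:
# sum((x - x.mean()) * (y - y.mean())) / (N - 1)
x = df['dogs'] - df['dogs'].mean()
y = df['cats'] - df['cats'].mean()
manual = (x * y).sum() / (len(df) - 1)

assert np.isclose(manual, df.cov().loc['dogs', 'cats'])  # both are -1.0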
