From b4fa50039085df8dc8aaedc8fc1d61de1a30934f Mon Sep 17 00:00:00 2001 From: Thiago Fonseca Date: Fri, 16 Mar 2018 11:59:05 -0300 Subject: [PATCH 1/3] DOC: Update pandas.core.resample.Resampler.nearest docstring --- pandas/core/resample.py | 131 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 6 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 004d572375234..73ca4464ecb22 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -498,23 +498,142 @@ def pad(self, limit=None): def nearest(self, limit=None): """ - Fill values with nearest neighbor starting from center + Fill the new missing values with their nearest neighbor value, based + on index. + + When resampling data, missing values may appear (e.g., when the + resampling frequency is higher than the original frequency). + The nearest fill will replace ``NaN`` values that appeared in + the resampled data with the value from the nearest member of the + sequence, based on the index value. + Missing values that existed in the original data will not be modified. + If `limit` is given, fill only `limit` values in each direction for + each of the original values. Parameters ---------- limit : integer, optional - limit of how many values to fill + Limit of how many values to fill. .. versionadded:: 0.21.0 Returns ------- - an upsampled Series + Series, DataFrame + An upsampled Series or DataFrame with ``NaN`` values filled with + their closest neighbor value. See Also -------- - Series.fillna - DataFrame.fillna + backfill: Backward fill the new missing values in the resampled data. + fillna : Fill ``NaN`` values using the specified method, which can be + 'backfill'. + pad : Forward fill ``NaN`` values. + pandas.Series.fillna : Fill ``NaN`` values in the Series using the + specified method, which can be 'backfill'. + pandas.DataFrame.fillna : Fill ``NaN`` values in the DataFrame using + the specified method, which can be 'backfill'. 
+ + Examples + -------- + + Resampling a Series: + + >>> s = pd.Series([1, 2, 3], + ... index=pd.date_range('20180101', periods=3, + ... freq='1h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 02:00:00 3 + Freq: H, dtype: int64 + + >>> s.resample('20min').nearest() + 2018-01-01 00:00:00 1 + 2018-01-01 00:20:00 1 + 2018-01-01 00:40:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:20:00 2 + 2018-01-01 01:40:00 3 + 2018-01-01 02:00:00 3 + Freq: 20T, dtype: int64 + + Resample in the middle: + + >>> s.resample('30min').nearest() + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + Limited fill: + + >>> s.resample('10min').nearest(limit=1) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:10:00 1.0 + 2018-01-01 00:20:00 NaN + 2018-01-01 00:30:00 NaN + 2018-01-01 00:40:00 NaN + 2018-01-01 00:50:00 2.0 + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:10:00 2.0 + 2018-01-01 01:20:00 NaN + 2018-01-01 01:30:00 NaN + 2018-01-01 01:40:00 NaN + 2018-01-01 01:50:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 10T, dtype: float64 + + Resampling a DataFrame that has missing values: + + >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, + ... index=pd.date_range('20180101', periods=3, + ... freq='h')) + >>> df + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('20min').nearest() + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:20:00 2.0 1 + 2018-01-01 00:40:00 NaN 3 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:20:00 NaN 3 + 2018-01-01 01:40:00 6.0 5 + 2018-01-01 02:00:00 6.0 5 + + Resampling a DataFrame with shuffled indexes: + + >>> df = pd.DataFrame({'a': [2, 6, 4]}, + ... index=pd.date_range('20180101', periods=3, + ... 
freq='h')) + >>> df + a + 2018-01-01 00:00:00 2 + 2018-01-01 01:00:00 6 + 2018-01-01 02:00:00 4 + + >>> sorted_df = df.sort_values(by=['a']) + >>> sorted_df + a + 2018-01-01 00:00:00 2 + 2018-01-01 02:00:00 4 + 2018-01-01 01:00:00 6 + + >>> sorted_df.resample('20min').nearest() + a + 2018-01-01 00:00:00 2 + 2018-01-01 00:20:00 2 + 2018-01-01 00:40:00 6 + 2018-01-01 01:00:00 6 + 2018-01-01 01:20:00 6 + 2018-01-01 01:40:00 4 + 2018-01-01 02:00:00 4 """ return self._upsample('nearest', limit=limit) @@ -527,7 +646,7 @@ def backfill(self, limit=None): appear (e.g., when the resampling frequency is higher than the original frequency). The backward fill will replace NaN values that appeared in the resampled data with the next value in the original sequence. - Missing values that existed in the orginal data will not be modified. + Missing values that existed in the original data will not be modified. Parameters ---------- From 14a3b3ae5709423b6f5001f512a2b2a2ed1567f4 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 3 Nov 2018 06:16:26 +0000 Subject: [PATCH 2/3] Minor fixes and making examples shorter --- pandas/core/resample.py | 76 ++++------------------------------------- 1 file changed, 7 insertions(+), 69 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a2a3e73fe1f9b..a473b8d9c9673 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -418,8 +418,7 @@ def pad(self, limit=None): def nearest(self, limit=None): """ - Fill the new missing values with their nearest neighbor value, based - on index. + Resample by using the nearest value. When resampling data, missing values may appear (e.g., when the resampling frequency is higher than the original frequency). @@ -432,16 +431,16 @@ def nearest(self, limit=None): Parameters ---------- - limit : integer, optional + limit : int, optional Limit of how many values to fill. .. 
versionadded:: 0.21.0 Returns ------- - Series, DataFrame + Series or DataFrame An upsampled Series or DataFrame with ``NaN`` values filled with - their closest neighbor value. + their nearest value. See Also -------- @@ -456,11 +455,9 @@ def nearest(self, limit=None): Examples -------- - - Resampling a Series: - >>> s = pd.Series([1, 2, 3], - ... index=pd.date_range('20180101', periods=3, + ... index=pd.date_range('20180101', + ... periods=3, ... freq='1h')) >>> s 2018-01-01 00:00:00 1 @@ -478,17 +475,7 @@ def nearest(self, limit=None): 2018-01-01 02:00:00 3 Freq: 20T, dtype: int64 - Resample in the middle: - - >>> s.resample('30min').nearest() - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 3 - 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 - - Limited fill: + Limit the number of upsampled values imputed by the nearest: >>> s.resample('10min').nearest(limit=1) 2018-01-01 00:00:00 1.0 @@ -505,55 +492,6 @@ def nearest(self, limit=None): 2018-01-01 01:50:00 3.0 2018-01-01 02:00:00 3.0 Freq: 10T, dtype: float64 - - Resampling a DataFrame that has missing values: - - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... freq='h')) - >>> df - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 02:00:00 6.0 5 - - >>> df.resample('20min').nearest() - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 00:20:00 2.0 1 - 2018-01-01 00:40:00 NaN 3 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 01:20:00 NaN 3 - 2018-01-01 01:40:00 6.0 5 - 2018-01-01 02:00:00 6.0 5 - - Resampling a DataFrame with shuffled indexes: - - >>> df = pd.DataFrame({'a': [2, 6, 4]}, - ... index=pd.date_range('20180101', periods=3, - ... 
freq='h')) - >>> df - a - 2018-01-01 00:00:00 2 - 2018-01-01 01:00:00 6 - 2018-01-01 02:00:00 4 - - >>> sorted_df = df.sort_values(by=['a']) - >>> sorted_df - a - 2018-01-01 00:00:00 2 - 2018-01-01 02:00:00 4 - 2018-01-01 01:00:00 6 - - >>> sorted_df.resample('20min').nearest() - a - 2018-01-01 00:00:00 2 - 2018-01-01 00:20:00 2 - 2018-01-01 00:40:00 6 - 2018-01-01 01:00:00 6 - 2018-01-01 01:20:00 6 - 2018-01-01 01:40:00 4 - 2018-01-01 02:00:00 4 """ return self._upsample('nearest', limit=limit) From 3ab559ac24d78e6724836c70b344a1f6b3776b78 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 4 Nov 2018 04:52:50 +0000 Subject: [PATCH 3/3] Addressing comments from the review, and making the examples and see also section shorter --- pandas/core/resample.py | 45 +++++++++++++---------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a473b8d9c9673..0dcc11aa2e193 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -422,11 +422,11 @@ def nearest(self, limit=None): When resampling data, missing values may appear (e.g., when the resampling frequency is higher than the original frequency). - The nearest fill will replace ``NaN`` values that appeared in + The `nearest` method will replace ``NaN`` values that appeared in the resampled data with the value from the nearest member of the sequence, based on the index value. Missing values that existed in the original data will not be modified. - If `limit` is given, fill only `limit` values in each direction for + If `limit` is given, fill only this many values in each direction for each of the original values. Parameters @@ -444,54 +444,37 @@ def nearest(self, limit=None): See Also -------- - backfill: Backward fill the new missing values in the resampled data. - fillna : Fill ``NaN`` values using the specified method, which can be - 'backfill'. + backfill : Backward fill the new missing values in the resampled data. 
pad : Forward fill ``NaN`` values. - pandas.Series.fillna : Fill ``NaN`` values in the Series using the - specified method, which can be 'backfill'. - pandas.DataFrame.fillna : Fill ``NaN`` values in the DataFrame using - the specified method, which can be 'backfill'. Examples -------- - >>> s = pd.Series([1, 2, 3], + >>> s = pd.Series([1, 2], ... index=pd.date_range('20180101', - ... periods=3, + ... periods=2, ... freq='1h')) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - 2018-01-01 02:00:00 3 Freq: H, dtype: int64 - >>> s.resample('20min').nearest() + >>> s.resample('15min').nearest() 2018-01-01 00:00:00 1 - 2018-01-01 00:20:00 1 - 2018-01-01 00:40:00 2 + 2018-01-01 00:15:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 00:45:00 2 2018-01-01 01:00:00 2 - 2018-01-01 01:20:00 2 - 2018-01-01 01:40:00 3 - 2018-01-01 02:00:00 3 - Freq: 20T, dtype: int64 + Freq: 15T, dtype: int64 Limit the number of upsampled values imputed by the nearest: - >>> s.resample('10min').nearest(limit=1) + >>> s.resample('15min').nearest(limit=1) 2018-01-01 00:00:00 1.0 - 2018-01-01 00:10:00 1.0 - 2018-01-01 00:20:00 NaN + 2018-01-01 00:15:00 1.0 2018-01-01 00:30:00 NaN - 2018-01-01 00:40:00 NaN - 2018-01-01 00:50:00 2.0 + 2018-01-01 00:45:00 2.0 2018-01-01 01:00:00 2.0 - 2018-01-01 01:10:00 2.0 - 2018-01-01 01:20:00 NaN - 2018-01-01 01:30:00 NaN - 2018-01-01 01:40:00 NaN - 2018-01-01 01:50:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 10T, dtype: float64 + Freq: 15T, dtype: float64 """ return self._upsample('nearest', limit=limit)