diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst
index eecacde8ad14e..704b0c4d80537 100644
--- a/doc/source/comparison_with_r.rst
+++ b/doc/source/comparison_with_r.rst
@@ -6,7 +6,7 @@
    import pandas as pd
    import numpy as np
-   pd.options.display.max_rows=15
+   pd.options.display.max_rows = 15
 
 Comparison with R / R libraries
 *******************************
@@ -165,16 +165,15 @@ function.
 
 .. ipython:: python
 
-   df = pd.DataFrame({
-     'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9],
-     'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99],
-     'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
-     'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan,
-             np.nan]
-   })
+   df = pd.DataFrame(
+       {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
+        'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
+        'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
+        'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan,
+                np.nan]})
 
-   g = df.groupby(['by1','by2'])
-   g[['v1','v2']].mean()
+   g = df.groupby(['by1', 'by2'])
+   g[['v1', 'v2']].mean()
 
 For more details and examples see :ref:`the groupby documentation <groupby>`.
 
@@ -195,7 +194,7 @@ The :meth:`~pandas.DataFrame.isin` method is similar to R ``%in%`` operator:
 
 .. ipython:: python
 
-   s = pd.Series(np.arange(5),dtype=np.float32)
+   s = pd.Series(np.arange(5), dtype=np.float32)
    s.isin([2, 4])
 
 The ``match`` function returns a vector of the positions of matches
@@ -234,11 +233,11 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this:
 
    import random
    import string
 
-   baseball = pd.DataFrame({
-      'team': ["team %d" % (x+1) for x in range(5)]*5,
-      'player': random.sample(list(string.ascii_lowercase),25),
-      'batting avg': np.random.uniform(.200, .400, 25)
-   })
+   baseball = pd.DataFrame(
+       {'team': ["team %d" % (x + 1) for x in range(5)] * 5,
+        'player': random.sample(list(string.ascii_lowercase), 25),
+        'batting avg': np.random.uniform(.200, .400, 25)})
+
    baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max)
 
 For more details and examples see :ref:`the reshaping documentation
@@ -341,15 +340,13 @@ In ``pandas`` the equivalent expression, using the
 
 .. ipython:: python
 
-   df = pd.DataFrame({
-     'x': np.random.uniform(1., 168., 120),
-     'y': np.random.uniform(7., 334., 120),
-     'z': np.random.uniform(1.7, 20.7, 120),
-     'month': [5,6,7,8]*30,
-     'week': np.random.randint(1,4, 120)
-   })
+   df = pd.DataFrame({'x': np.random.uniform(1., 168., 120),
+                      'y': np.random.uniform(7., 334., 120),
+                      'z': np.random.uniform(1.7, 20.7, 120),
+                      'month': [5, 6, 7, 8] * 30,
+                      'week': np.random.randint(1, 4, 120)})
 
-   grouped = df.groupby(['month','week'])
+   grouped = df.groupby(['month', 'week'])
    grouped['x'].agg([np.mean, np.std])
 
@@ -374,8 +371,8 @@ In Python, since ``a`` is a list, you can simply use list comprehension.
 
 .. ipython:: python
 
-   a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4)
-   pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)])
+   a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4)
+   pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)])
 
 |meltlist|_
 ~~~~~~~~~~~~
@@ -393,7 +390,7 @@ In Python, this list would be a list of tuples, so
 
 .. ipython:: python
 
-   a = list(enumerate(list(range(1,5))+[np.NAN]))
+   a = list(enumerate(list(range(1, 5)) + [np.NAN]))
    pd.DataFrame(a)
 
 For more details and examples see :ref:`the Into to Data Structures
@@ -419,12 +416,13 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent:
 
 .. ipython:: python
 
-   cheese = pd.DataFrame({'first' : ['John', 'Mary'],
-                          'last' : ['Doe', 'Bo'],
-                          'height' : [5.5, 6.0],
-                          'weight' : [130, 150]})
+   cheese = pd.DataFrame({'first': ['John', 'Mary'],
+                          'last': ['Doe', 'Bo'],
+                          'height': [5.5, 6.0],
+                          'weight': [130, 150]})
+
    pd.melt(cheese, id_vars=['first', 'last'])
-   cheese.set_index(['first', 'last']).stack() # alternative way
+   cheese.set_index(['first', 'last']).stack()  # alternative way
 
 For more details and examples see :ref:`the reshaping documentation
 <reshaping.melt>`.
@@ -452,16 +450,15 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`:
 
 .. ipython:: python
 
-   df = pd.DataFrame({
-     'x': np.random.uniform(1., 168., 12),
-     'y': np.random.uniform(7., 334., 12),
-     'z': np.random.uniform(1.7, 20.7, 12),
-     'month': [5,6,7]*4,
-     'week': [1,2]*6
-   })
+   df = pd.DataFrame({'x': np.random.uniform(1., 168., 12),
+                      'y': np.random.uniform(7., 334., 12),
+                      'z': np.random.uniform(1.7, 20.7, 12),
+                      'month': [5, 6, 7] * 4,
+                      'week': [1, 2] * 6})
+
    mdf = pd.melt(df, id_vars=['month', 'week'])
-   pd.pivot_table(mdf, values='value', index=['variable','week'],
-                   columns=['month'], aggfunc=np.mean)
+   pd.pivot_table(mdf, values='value', index=['variable', 'week'],
+                  columns=['month'], aggfunc=np.mean)
 
 Similarly for ``dcast`` which uses a data.frame called ``df`` in R to
 aggregate information based on ``Animal`` and ``FeedType``:
@@ -491,13 +488,14 @@ using :meth:`~pandas.pivot_table`:
 
        'Amount': [10, 7, 4, 2, 5, 6, 2],
    })
 
-   df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum')
+   df.pivot_table(values='Amount', index='Animal', columns='FeedType',
+                  aggfunc='sum')
 
 The second approach is to use the :meth:`~pandas.DataFrame.groupby` method:
 
 .. ipython:: python
 
-   df.groupby(['Animal','FeedType'])['Amount'].sum()
+   df.groupby(['Animal', 'FeedType'])['Amount'].sum()
 
 For more details and examples see :ref:`the reshaping documentation
 <reshaping.pivot>` or :ref:`the groupby documentation<groupby>`.
@@ -516,8 +514,8 @@ In pandas this is accomplished with ``pd.cut`` and ``astype("category")``:
 
 .. ipython:: python
 
-   pd.cut(pd.Series([1,2,3,4,5,6]), 3)
-   pd.Series([1,2,3,2,2,3]).astype("category")
+   pd.cut(pd.Series([1, 2, 3, 4, 5, 6]), 3)
+   pd.Series([1, 2, 3, 2, 2, 3]).astype("category")
 
 For more details and examples see :ref:`categorical introduction <categorical>` and the
 :ref:`API documentation <api.categorical>`. There is also a documentation regarding the
diff --git a/doc/source/comparison_with_sql.rst b/doc/source/comparison_with_sql.rst
index db143cd586441..021f37eb5c66f 100644
--- a/doc/source/comparison_with_sql.rst
+++ b/doc/source/comparison_with_sql.rst
@@ -23,7 +23,8 @@ structure.
 
 .. ipython:: python
 
-   url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
+   url = ('https://raw.github.com/pandas-dev'
+          '/pandas/master/pandas/tests/data/tips.csv')
    tips = pd.read_csv(url)
    tips.head()
 
@@ -387,7 +388,7 @@ Top N rows with offset
 
 .. ipython:: python
 
-   tips.nlargest(10+5, columns='tip').tail(10)
+   tips.nlargest(10 + 5, columns='tip').tail(10)
 
 Top N rows per group
 ~~~~~~~~~~~~~~~~~~~~
@@ -411,8 +412,7 @@ Top N rows per group
                        .groupby(['day'])
                        .cumcount() + 1)
         .query('rn < 3')
-        .sort_values(['day','rn'])
-    )
+        .sort_values(['day', 'rn']))
 
 the same using `rank(method='first')` function
@@ -421,8 +421,7 @@ the same using `rank(method='first')` function
 
    (tips.assign(rnk=tips.groupby(['day'])['total_bill']
                         .rank(method='first', ascending=False))
         .query('rnk < 3')
-        .sort_values(['day','rnk'])
-    )
+        .sort_values(['day', 'rnk']))
 
 .. code-block:: sql
@@ -445,11 +444,10 @@ Notice that when using ``rank(method='min')`` function
 
 .. ipython:: python
 
    (tips[tips['tip'] < 2]
-       .assign(rnk_min=tips.groupby(['sex'])['tip']
-                           .rank(method='min'))
-       .query('rnk_min < 3')
-       .sort_values(['sex','rnk_min'])
-    )
+        .assign(rnk_min=tips.groupby(['sex'])['tip']
+                            .rank(method='min'))
+        .query('rnk_min < 3')
+        .sort_values(['sex', 'rnk_min']))
 
 UPDATE
diff --git a/doc/source/comparison_with_stata.rst b/doc/source/comparison_with_stata.rst
index 6c518983d5904..e039843b22065 100644
--- a/doc/source/comparison_with_stata.rst
+++ b/doc/source/comparison_with_stata.rst
@@ -102,9 +102,7 @@ and the values are the data.
 
 .. ipython:: python
 
-   df = pd.DataFrame({
-       'x': [1, 3, 5],
-       'y': [2, 4, 6]})
+   df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]})
    df
 
@@ -128,7 +126,8 @@ the data set if presented with a url.
 
 .. ipython:: python
 
-   url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
+   url = ('https://raw.github.com/pandas-dev'
+          '/pandas/master/pandas/tests/data/tips.csv')
    tips = pd.read_csv(url)
    tips.head()
@@ -278,17 +277,17 @@ see the :ref:`timeseries documentation<timeseries>` for more details.
 
    tips['date1_year'] = tips['date1'].dt.year
    tips['date2_month'] = tips['date2'].dt.month
    tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin()
-   tips['months_between'] = (tips['date2'].dt.to_period('M') -
-                             tips['date1'].dt.to_period('M'))
+   tips['months_between'] = (tips['date2'].dt.to_period('M')
+                             - tips['date1'].dt.to_period('M'))
 
-   tips[['date1','date2','date1_year','date2_month',
-         'date1_next','months_between']].head()
+   tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next',
+         'months_between']].head()
 
 .. ipython:: python
    :suppress:
 
-   tips = tips.drop(['date1','date2','date1_year',
-                     'date2_month','date1_next','months_between'], axis=1)
+   tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month',
+                     'date1_next', 'months_between'], axis=1)
 
 Selection of Columns
 ~~~~~~~~~~~~~~~~~~~~
@@ -472,7 +471,7 @@ The following tables will be used in the merge examples
 
                        'value': np.random.randn(4)})
    df1
    df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
-                      'value': np.random.randn(4)})
+                       'value': np.random.randn(4)})
    df2
 
 In Stata, to perform a merge, one data set must be in memory
@@ -661,7 +660,7 @@ In pandas this would be written as:
 
 .. ipython:: python
 
-   tips.groupby(['sex','smoker']).first()
+   tips.groupby(['sex', 'smoker']).first()
 
 Other Considerations
diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index 0d2021de8f88e..251dce5141ea5 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -4,14 +4,15 @@
    :suppress:
 
    import numpy as np
+   import matplotlib.pyplot as plt
+
+   import pandas as pd
+
    np.random.seed(123456)
    np.set_printoptions(precision=4, suppress=True)
-   import pandas as pd
-   import matplotlib
-   # matplotlib.style.use('default')
-   import matplotlib.pyplot as plt
+   pd.options.display.max_rows = 15
+   plt.close('all')
 
-   pd.options.display.max_rows=15
 
 .. _computation:
@@ -75,7 +76,8 @@ series in the DataFrame, also excluding NA/null values.
 
 .. ipython:: python
 
-   frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
+   frame = pd.DataFrame(np.random.randn(1000, 5),
+                        columns=['a', 'b', 'c', 'd', 'e'])
    frame.cov()
 
 ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that
@@ -127,7 +129,8 @@ Wikipedia has articles covering the above correlation coefficients:
 
 .. ipython:: python
 
-   frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
+   frame = pd.DataFrame(np.random.randn(1000, 5),
+                        columns=['a', 'b', 'c', 'd', 'e'])
    frame.iloc[::2] = np.nan
 
    # Series with Series
@@ -163,9 +166,10 @@ compute the correlation based on histogram intersection:
 
 .. ipython:: python
 
    # histogram intersection
-   histogram_intersection = lambda a, b: np.minimum(
-       np.true_divide(a, a.sum()), np.true_divide(b, b.sum())
-   ).sum()
+   def histogram_intersection(a, b):
+       return np.minimum(np.true_divide(a, a.sum()),
+                         np.true_divide(b, b.sum())).sum()
+
    frame.corr(method=histogram_intersection)
 
 A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to
@@ -192,7 +196,7 @@ assigned the mean of the ranks (by default) for the group:
 
 .. ipython:: python
 
    s = pd.Series(np.random.np.random.randn(5), index=list('abcde'))
-   s['d'] = s['b'] # so there's a tie
+   s['d'] = s['b']  # so there's a tie
    s.rank()
 
@@ -202,7 +206,7 @@ ranking.
 
 .. ipython:: python
 
    df = pd.DataFrame(np.random.np.random.randn(10, 6))
-   df[4] = df[2][:5] # some ties
+   df[4] = df[2][:5]  # some ties
    df
    df.rank(1)
 
@@ -243,7 +247,8 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan
 
 .. ipython:: python
 
-   s = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
+   s = pd.Series(np.random.randn(1000),
+                 index=pd.date_range('1/1/2000', periods=1000))
    s = s.cumsum()
    s
@@ -258,7 +263,7 @@ These object provide tab-completion of the available methods and properties.
 
 .. code-block:: ipython
 
-   In [14]: r.
+   In [14]: r.  # noqa: E225, E999
    r.agg         r.apply       r.count       r.exclusions  r.max         r.median      r.name        r.skew        r.sum
    r.aggregate   r.corr        r.cov         r.kurt        r.mean        r.min         r.quantile    r.std         r.var
@@ -336,7 +341,9 @@ compute the mean absolute deviation on a rolling basis:
 
 .. ipython:: python
 
-   mad = lambda x: np.fabs(x - x.mean()).mean()
+   def mad(x):
+       return np.fabs(x - x.mean()).mean()
+
    @savefig rolling_apply_ex.png
    s.rolling(window=60).apply(mad, raw=True).plot(style='k')
@@ -376,7 +383,8 @@ The list of recognized types are the `scipy.signal window functions
 
 .. ipython:: python
 
-   ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10))
+   ser = pd.Series(np.random.randn(10),
+                   index=pd.date_range('1/1/2000', periods=10))
 
    ser.rolling(window=5, win_type='triang').mean()
@@ -423,7 +431,9 @@ This can be particularly useful for a non-regular time frequency index.
 
 .. ipython:: python
 
    dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
-                      index=pd.date_range('20130101 09:00:00', periods=5, freq='s'))
+                      index=pd.date_range('20130101 09:00:00',
+                                          periods=5,
+                                          freq='s'))
    dft
 
 This is a regular frequency index. Using an integer window parameter works to roll along the window frequency.
@@ -445,12 +455,12 @@ Using a non-regular, but still monotonic index, rolling with an integer window d
 
 .. ipython:: python
 
    dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
-                      index = pd.Index([pd.Timestamp('20130101 09:00:00'),
-                                        pd.Timestamp('20130101 09:00:02'),
-                                        pd.Timestamp('20130101 09:00:03'),
-                                        pd.Timestamp('20130101 09:00:05'),
-                                        pd.Timestamp('20130101 09:00:06')],
-                                       name='foo'))
+                      index=pd.Index([pd.Timestamp('20130101 09:00:00'),
+                                      pd.Timestamp('20130101 09:00:02'),
+                                      pd.Timestamp('20130101 09:00:03'),
+                                      pd.Timestamp('20130101 09:00:05'),
+                                      pd.Timestamp('20130101 09:00:06')],
+                                     name='foo'))
    dft
 
    dft.rolling(2).sum()
@@ -496,11 +506,11 @@ from present information back to past information. This allows the rolling windo
 
 .. ipython:: python
 
    df = pd.DataFrame({'x': 1},
-                     index = [pd.Timestamp('20130101 09:00:01'),
-                              pd.Timestamp('20130101 09:00:02'),
-                              pd.Timestamp('20130101 09:00:03'),
-                              pd.Timestamp('20130101 09:00:04'),
-                              pd.Timestamp('20130101 09:00:06')])
+                     index=[pd.Timestamp('20130101 09:00:01'),
+                            pd.Timestamp('20130101 09:00:02'),
+                            pd.Timestamp('20130101 09:00:03'),
+                            pd.Timestamp('20130101 09:00:04'),
+                            pd.Timestamp('20130101 09:00:06')])
 
    df["right"] = df.rolling('2s', closed='right').x.sum()  # default
    df["both"] = df.rolling('2s', closed='both').x.sum()
@@ -601,7 +611,8 @@ can even be omitted:
 
 .. ipython:: python
 
-   covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True)
+   covs = (df[['B', 'C', 'D']].rolling(window=50)
+           .cov(df[['A', 'B', 'C']], pairwise=True))
    covs.loc['2002-09-22':]
 
 .. ipython:: python
@@ -637,7 +648,7 @@ perform multiple computations on the data. These operations are similar to the :
 
    dfa = pd.DataFrame(np.random.randn(1000, 3),
                       index=pd.date_range('1/1/2000', periods=1000),
                       columns=['A', 'B', 'C'])
-   r = dfa.rolling(window=60,min_periods=1)
+   r = dfa.rolling(window=60, min_periods=1)
    r
 
 We can aggregate by passing a function to the entire DataFrame, or select a
@@ -649,7 +660,7 @@ Series (or multiple Series) via standard ``__getitem__``.
 
    r['A'].aggregate(np.sum)
 
-   r[['A','B']].aggregate(np.sum)
+   r[['A', 'B']].aggregate(np.sum)
 
 As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected.
@@ -683,24 +694,21 @@ By passing a dict to ``aggregate`` you can apply a different aggregation to the
 columns of a ``DataFrame``:
 
 .. ipython:: python
-   :okexcept:
-   :okwarning:
 
-   r.agg({'A' : np.sum,
-          'B' : lambda x: np.std(x, ddof=1)})
+   r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
 
 The function names can also be strings. In order for a string to be valid it
 must be implemented on the windowed object
 
 .. ipython:: python
 
-   r.agg({'A' : 'sum', 'B' : 'std'})
+   r.agg({'A': 'sum', 'B': 'std'})
 
 Furthermore you can pass a nested dict to indicate different aggregations on
 different columns.
 
 .. ipython:: python
 
-   r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] })
+   r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']})
 
 .. _stats.moments.expanding: