From d2759d59e52ba7ec7b31d7d1bf25610cffa7e56d Mon Sep 17 00:00:00 2001
From: No-Stream
Date: Tue, 31 Oct 2017 15:31:38 -0700
Subject: [PATCH 01/21] ENH: gb.is_monotonic_increasing #17015 fix rebase
 conflicts

---
 doc/source/whatsnew/v0.21.0.txt        | 10 ++++++
 doc/source/whatsnew/v0.21.1.txt        |  2 +-
 pandas/core/groupby.py                 | 49 ++++++++++++++++++++++++++
 pandas/tests/groupby/test_whitelist.py | 23 ++++++------
 4 files changed, 72 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 4c460eeb85b82..f092f79e2f103 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -309,11 +309,21 @@ New keywords
 - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`)
 - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`)
 - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`)
+<<<<<<< HEAD
 - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
 - :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)
 
 Various enhancements
 """"""""""""""""""""
+=======
+- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)
+- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
+- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
+- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `.
+- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
+- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
+- :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`).
+- :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`)
 - Improved the import time of pandas by about 2.25x. (:issue:`16764`)
 - Support for `PEP 519 -- Adding a file system path protocol
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 25a891eab0e86..8bf99e7990dc4 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -22,7 +22,7 @@ Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`)
--
+- ``GroupBy.is_monotonic_increasing`` and ``GroupBy.is_monotonic_decreasing`` extend ``Series.is_monotonic_increasing`` to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`)
 -
 
 .. _whatsnew_0211.deprecations:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 5c07033f5a68f..ec3cce2821036 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1743,6 +1743,55 @@ def pipe(self, func, *args, **kwargs):
         """
         return _pipe(self, func, *args, **kwargs)
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def is_monotonic_increasing(self):
+        """
+        Returns whether each group is monotonically increasing.
+
+        Equivalent to ``.apply(lambda x: x.is_monotonic_increasing)``.
+
+        Examples
+        --------
+        >>> source_dict = {
+        ... 'A': ['this', 'col', 'is', 'entirely', 'irrelevant', '.'],
+        ... 'B': ['cat_a', 'cat_a', 'cat_a', 'cat_b', 'cat_b', 'cat_b'],
+        ... 'C': [1, 2, 3, 2, 2, 0]}
+
+        >>> df = pd.DataFrame(source_dict)
+        >>> df.groupby(['B']).C.is_monotonic_increasing()
+        B
+        cat_a     True
+        cat_b    False
+        Name: C, dtype: bool
+
+        """
+        return self.apply(lambda x: x.is_monotonic_increasing)
+
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def is_monotonic_decreasing(self):
+        """
+        Returns whether each group is monotonically decreasing.
+
+        Equivalent to ``.apply(lambda x: x.is_monotonic_decreasing)``.
+
+        Examples
+        --------
+        >>> source_dict = {
+        ... 'A': ['this', 'col', 'is', 'entirely', 'irrelevant', '.'],
+        ... 'B': ['cat_a', 'cat_a', 'cat_a', 'cat_b', 'cat_b', 'cat_b'],
+        ... 'C': [1, 2, 3, 2, 2, 0]}
+
+        >>> df = pd.DataFrame(source_dict)
+        >>> df.groupby(['B']).C.is_monotonic_decreasing()
+        B
+        cat_a    False
+        cat_b     True
+        Name: C, dtype: bool
+        """
+        return self.apply(lambda x: x.is_monotonic_decreasing)
+
 
 GroupBy._add_numeric_operations()
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
index e8e2150558edb..37c3f725ff9a7 100644
--- a/pandas/tests/groupby/test_whitelist.py
+++ b/pandas/tests/groupby/test_whitelist.py
@@ -239,17 +239,18 @@ def test_groupby_blacklist(df_letters):
 def test_tab_completion(mframe):
     grp = mframe.groupby(level='second')
     results = set([v for v in dir(grp) if not v.startswith('_')])
-    expected = {
-        'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
-        'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
-        'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
-        'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
-        'nunique', 'head', 'describe', 'cummax', 'quantile',
-        'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
-        'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
-        'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
-        'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
-        'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe'}
+    expected = set(
+        ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
+         'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
+         'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
+         'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
+         'nunique', 'head', 'describe', 'cummax', 'quantile',
+         'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
+         'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
+         'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
+         'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
+         'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe',
+         'is_monotonic_increasing', 'is_monotonic_decreasing'])
 
     assert results == expected

From 4c70dae75f9b3d348936804c7b191551cebc943a Mon Sep 17 00:00:00 2001
From: No-Stream
Date: Tue, 31 Oct 2017 15:46:02 -0700
Subject: [PATCH 02/21] ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts --- doc/source/whatsnew/v0.21.0.txt | 1 - pandas/tests/groupby/test_groupby.py | 122 +++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f092f79e2f103..adba6fbf2b356 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -309,7 +309,6 @@ New keywords - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) -<<<<<<< HEAD - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9d25117fbd954..a3788dfe04ca9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3701,6 +3701,128 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) + def test_is_increasing_is_decreasing(self): + # GH 17015 + + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, False, False, True], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # Test with inf vals + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf]} + expected.index.name = 'B' + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, False, True, False], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. 
+ expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # Test with nan vals; should always be False + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[False, False, False, False], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # Test with single member groups; should be True except for np.nan + source_dict = { + 'A': ['1', '2', '3', '4'], + 'B': ['a', 'b', 'c', 'd'], + 'C': [1, 2, np.nan, np.inf]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, True, False, True], + name='C') + expected.index.name = 'B' + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # As above, for .is_monotonic_decreasing() + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_decreasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, False, False, True], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_decreasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) + tm.assert_series_equal(result, expected) + + # Test with inf vals + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, + -np.inf]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_decreasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, True, False, True], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_decreasing. 
+ expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) + tm.assert_series_equal(result, expected) + + # Test with nan vals; should always be False + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_decreasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[False, False, False, False], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_decreasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) + tm.assert_series_equal(result, expected) + + def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime From d88acdd435105e4f2860c57491669588e0c22bd9 Mon Sep 17 00:00:00 2001 From: No-Stream Date: Fri, 8 Sep 2017 12:57:21 -0700 Subject: [PATCH 03/21] parametrized tests for gb.is_monotonic_increasing/decreasing --- doc/source/api.rst | 2 + pandas/tests/groupby/test_groupby.py | 118 +++++++-------------------- 2 files changed, 31 insertions(+), 89 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index e8b8b3624740d..a34a757db7a75 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2052,6 +2052,8 @@ The following methods are available only for ``SeriesGroupBy`` objects. SeriesGroupBy.nunique SeriesGroupBy.unique SeriesGroupBy.value_counts + SeriesGroupBy.is_monotonic_increasing + SeriesGroupBy.is_monotonic_decreasing The following methods are available only for ``DataFrameGroupBy`` objects. diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a3788dfe04ca9..a6f082ea68e39 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3701,128 +3701,68 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) - def test_is_increasing_is_decreasing(self): - # GH 17015 - + @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly increasing (T), strictly decreasing (F), # abs val increasing (F), non-strictly increasing (T) - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, False, False, True], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_increasing. 
- expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], + [True, False, False, True]), # Test with inf vals - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf]} - expected.index.name = 'B' - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, False, True, False], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - + ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False]), # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_increasing(self, in_vals, out_vals): + # GH 17015 source_dict = { 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + 'C': in_vals} df = pd.DataFrame(source_dict) result = df.groupby(['B']).C.is_monotonic_increasing() expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[False, False, False, False], + data=out_vals, name='C') expected.index.name = 'B' tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - # Test with single member groups; should be True except for np.nan - source_dict = { - 'A': ['1', '2', '3', '4'], - 'B': ['a', 'b', 'c', 'd'], - 'C': [1, 2, np.nan, np.inf]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, True, False, True], - name='C') - expected.index.name = 'B' - expected.index.name = 'B' - tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + expected = ( + df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) tm.assert_series_equal(result, expected) - # As above, for .is_monotonic_decreasing() + @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly decreasing (T), strictly increasing (F), # abs val decreasing (F), non-strictly increasing (T) - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_decreasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, False, False, True], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_decreasing. 
- expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) - tm.assert_series_equal(result, expected) - + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], + [True, False, False, True]), # Test with inf vals - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, - -np.inf]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_decreasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, True, False, True], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_decreasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) - tm.assert_series_equal(result, expected) - + ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True]), # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_decreasing(self, in_vals, out_vals): + # GH 17015 source_dict = { 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + 'C': in_vals} + df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_decreasing() + result = df.groupby('B').C.is_monotonic_decreasing() expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[False, False, False, False], + data=out_vals, name='C') expected.index.name = 'B' tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_decreasing. expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) tm.assert_series_equal(result, expected) - def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime From 53e5a2b557c73c9c2790d482dcb57ff0c3884557 Mon Sep 17 00:00:00 2001 From: No-Stream Date: Wed, 6 Sep 2017 13:42:23 -0700 Subject: [PATCH 04/21] ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 --- doc/source/whatsnew/v0.21.0.txt | 5 +++++ pandas/core/groupby.py | 3 +++ pandas/tests/groupby/test_whitelist.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index adba6fbf2b356..efe7a415a260a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -323,6 +323,7 @@ Various enhancements - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) +<<<<<<< HEAD - Improved the import time of pandas by about 2.25x. (:issue:`16764`) - Support for `PEP 519 -- Adding a file system path protocol @@ -347,6 +348,10 @@ Various enhancements - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. 
(:issue:`17613`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`) - :func:`Series.reindex`, :func:`DataFrame.reindex`, :func:`Index.get_indexer` now support list-like argument for ``tolerance``. (:issue:`17367`) +======= +- :func: groupby.is_monotonic_increasing and .is_monotonic_decreasing extend Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) + +>>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 .. _whatsnew_0210.api_breaking: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ec3cce2821036..9d63cf84ebe3b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1695,6 +1695,7 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] +<<<<<<< HEAD def pipe(self, func, *args, **kwargs): """ Apply a function with arguments to this GroupBy object, @@ -1743,6 +1744,8 @@ def pipe(self, func, *args, **kwargs): """ return _pipe(self, func, *args, **kwargs) +======= +>>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 @Substitution(name='groupby') @Appender(_doc_template) def is_monotonic_increasing(self): diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 37c3f725ff9a7..3c472fec84d43 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -249,7 +249,11 @@ def test_tab_completion(mframe): 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', +<<<<<<< HEAD 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', +======= + 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', +>>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 'is_monotonic_increasing', 'is_monotonic_decreasing']) assert results == expected From 5a30ee4d3f6254684aec8e406b5c5425263e38df Mon Sep 17 00:00:00 2001 From: No-Stream Date: Thu, 7 Sep 2017 15:28:44 -0700 Subject: [PATCH 05/21] added tests for gb.is_monotonically_increasing()/decreasing --- doc/source/whatsnew/v0.21.0.txt | 4 + pandas/tests/groupby/test_groupby.py | 124 +++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index efe7a415a260a..f861361a92ba7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -324,6 +324,7 @@ Various enhancements - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) <<<<<<< HEAD +<<<<<<< HEAD - Improved the import time of pandas by about 2.25x. (:issue:`16764`) - Support for `PEP 519 -- Adding a file system path protocol @@ -350,6 +351,9 @@ Various enhancements - :func:`Series.reindex`, :func:`DataFrame.reindex`, :func:`Index.get_indexer` now support list-like argument for ``tolerance``. 
(:issue:`17367`) ======= - :func: groupby.is_monotonic_increasing and .is_monotonic_decreasing extend Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) +======= +- is_monotonic_increasing/decreasing is added to .groupby(). (:issue:`17015`) +>>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing >>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a6f082ea68e39..0d89a417f3f81 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3701,6 +3701,7 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) +<<<<<<< HEAD @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly increasing (T), strictly decreasing (F), # abs val increasing (F), non-strictly increasing (T) @@ -3759,10 +3760,133 @@ def test_is_monotonic_decreasing(self, in_vals, out_vals): expected.index.name = 'B' tm.assert_series_equal(result, expected) +======= + def test_is_increasing_is_decreasing(self): + # GH 17015 + + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, False, False, True], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # Test with inf vals + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf]} + expected.index.name = 'B' + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, False, True, False], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # Test with nan vals; should always be False + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[False, False, False, False], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. 
+ expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # Test with single member groups; should be True except for np.nan + source_dict = { + 'A': ['1', '2', '3', '4'], + 'B': ['a', 'b', 'c', 'd'], + 'C': [1, 2, np.nan, np.inf]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, True, False, True], + name='C') + expected.index.name = 'B' + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + # As above, for .is_monotonic_decreasing() + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_decreasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, False, False, True], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_decreasing. expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) tm.assert_series_equal(result, expected) + # Test with inf vals + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, + -np.inf]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_decreasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[True, True, False, True], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) + # Also check result equal to manually taking x.is_monotonic_decreasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) + tm.assert_series_equal(result, expected) + + # Test with nan vals; should always be False + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_decreasing() + expected = pd.Series(index=['a', 'b', 'c', 'd'], + data=[False, False, False, False], + name='C') + expected.index.name = 'B' + tm.assert_series_equal(result, expected) +>>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing + # Also check result equal to manually taking x.is_monotonic_decreasing. + expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) + tm.assert_series_equal(result, expected) + +<<<<<<< HEAD +======= + +>>>>>>> 740c7c2... 
added tests for gb.is_monotonically_increasing()/decreasing def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime From 2e4bb15bc4101bad730d112f936a191a66acbe51 Mon Sep 17 00:00:00 2001 From: No-Stream Date: Fri, 8 Sep 2017 12:57:21 -0700 Subject: [PATCH 06/21] parametrized tests for gb.is_monotonic_increasing/decreasing --- pandas/tests/groupby/test_groupby.py | 123 +++++++++------------------ 1 file changed, 38 insertions(+), 85 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0d89a417f3f81..87e06fbb01e81 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3701,6 +3701,7 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) +<<<<<<< HEAD <<<<<<< HEAD @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly increasing (T), strictly decreasing (F), @@ -3764,129 +3765,81 @@ def test_is_monotonic_decreasing(self, in_vals, out_vals): def test_is_increasing_is_decreasing(self): # GH 17015 +======= + @pytest.mark.parametrize('in_vals, out_vals', [ +>>>>>>> f8554ee... parametrized tests for gb.is_monotonic_increasing/decreasing # Basics: strictly increasing (T), strictly decreasing (F), # abs val increasing (F), non-strictly increasing (T) - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, False, False, True], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], + [True, False, False, True]), # Test with inf vals - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf]} - expected.index.name = 'B' - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, False, True, False], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_increasing. 
- expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - + ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False]), # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_increasing(self, in_vals, out_vals): + # GH 17015 source_dict = { 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + 'C': in_vals} df = pd.DataFrame(source_dict) result = df.groupby(['B']).C.is_monotonic_increasing() expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[False, False, False, False], + data=out_vals, name='C') expected.index.name = 'B' tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - # Test with single member groups; should be True except for np.nan - source_dict = { - 'A': ['1', '2', '3', '4'], - 'B': ['a', 'b', 'c', 'd'], - 'C': [1, 2, np.nan, np.inf]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, True, False, True], - name='C') - expected.index.name = 'B' - expected.index.name = 'B' - tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_increasing) + expected = ( + df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) tm.assert_series_equal(result, expected) - # As above, for .is_monotonic_decreasing() + @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly decreasing (T), strictly increasing (F), # abs val decreasing (F), non-strictly increasing (T) - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_decreasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, False, False, True], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_decreasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) - tm.assert_series_equal(result, expected) - + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], + [True, False, False, True]), # Test with inf vals - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, - -np.inf]} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_decreasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[True, True, False, True], - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - # Also check result equal to manually taking x.is_monotonic_decreasing. 
- expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) - tm.assert_series_equal(result, expected) - + ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True]), # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_decreasing(self, in_vals, out_vals): + # GH 17015 source_dict = { 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan]} + 'C': in_vals} + df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_decreasing() + result = df.groupby('B').C.is_monotonic_decreasing() expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=[False, False, False, False], + data=out_vals, name='C') expected.index.name = 'B' tm.assert_series_equal(result, expected) +<<<<<<< HEAD >>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing +======= + +>>>>>>> f8554ee... parametrized tests for gb.is_monotonic_increasing/decreasing # Also check result equal to manually taking x.is_monotonic_decreasing. expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) tm.assert_series_equal(result, expected) +<<<<<<< HEAD <<<<<<< HEAD ======= >>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing +======= +>>>>>>> f8554ee... parametrized tests for gb.is_monotonic_increasing/decreasing def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime From 3122f1f4039954b0682527208cfde3e54508f41c Mon Sep 17 00:00:00 2001 From: No-Stream Date: Tue, 31 Oct 2017 13:15:59 -0700 Subject: [PATCH 07/21] removed edits to whatsnew 0.21.0 --- doc/source/whatsnew/v0.21.0.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f861361a92ba7..c62aad1a19e42 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -325,6 +325,7 @@ Various enhancements - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD - Improved the import time of pandas by about 2.25x. (:issue:`16764`) - Support for `PEP 519 -- Adding a file system path protocol @@ -354,6 +355,8 @@ Various enhancements ======= - is_monotonic_increasing/decreasing is added to .groupby(). (:issue:`17015`) >>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing +======= +>>>>>>> 8ed37cd... removed edits to whatsnew 0.21.0 >>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 From a6d0640f1e38a78a689dcb4447b81da6a5bef14d Mon Sep 17 00:00:00 2001 From: No-Stream Date: Tue, 31 Oct 2017 16:35:59 -0700 Subject: [PATCH 08/21] ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts --- doc/source/whatsnew/v0.21.0.txt | 898 ++++--------------------- pandas/tests/groupby/test_whitelist.py | 15 + 2 files changed, 138 insertions(+), 775 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index c62aad1a19e42..0c01d2f19a62d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1,49 +1,33 @@ .. 
_whatsnew_0210: -v0.21.0 (October 27, 2017) --------------------------- +v0.21.0 (???) +------------- -This is a major release from 0.20.3 and includes a number of API changes, deprecations, new features, +This is a major release from 0.20.x and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: -- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. -- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying - categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here `. -- Compatibility fixes for pypy, see :ref:`here `. -- Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. -- Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). -- Indexing with a list of labels, where one or more of the labels is missing, is deprecated and will raise a KeyError in a future version, see :ref:`here `. +- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. .. contents:: What's new in v0.21.0 :local: :backlinks: none - :depth: 2 .. _whatsnew_0210.enhancements: New features ~~~~~~~~~~~~ -.. _whatsnew_0210.enhancements.parquet: - -Integration with Apache Parquet file format -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here ` (:issue:`15838`, :issue:`17438`). - -`Apache Parquet `__ provides a cross-language, binary file format for reading and writing data frames efficiently. -Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, supporting all of the pandas -dtypes, including extension dtypes such as datetime with timezones. - -This functionality depends on either the `pyarrow `__ or `fastparquet `__ library. -For more details, see see :ref:`the IO docs on Parquet `. - +- Support for `PEP 519 -- Adding a file system path protocol + `_ on most readers and writers (:issue:`13823`) +- Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, + and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to + support type inference in the presence of missing values (:issue:`17059`). .. _whatsnew_0210.enhancements.infer_objects: @@ -57,7 +41,7 @@ method. See the documentation :ref:`here ` for more details. (:issue:`11221`) This method only performs soft conversions on object columns, converting Python objects -to native types, but not any coercive conversions. For example: +to native types, but not any coercive conversions. For example: .. ipython:: python @@ -68,7 +52,7 @@ to native types, but not any coercive conversions. For example: df.infer_objects().dtypes Note that column ``'C'`` was not converted - only scalar numeric types -will be converted to a new type. 
Other types of conversion should be accomplished +will be inferred to a new type. Other types of conversion should be accomplished using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). .. ipython:: python @@ -82,239 +66,62 @@ using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedel Improved warnings when attempting to create columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -New users are often puzzled by the relationship between column operations and -attribute access on ``DataFrame`` instances (:issue:`7175`). One specific -instance of this confusion is attempting to create a new column by setting an -attribute on the ``DataFrame``: +New users are often flummoxed by the relationship between column operations and attribute +access on ``DataFrame`` instances (:issue:`5904` & :issue:`7175`). Two specific instances +of this confusion include attempting to create a new column by setting into an attribute: .. code-block:: ipython - In[1]: df = pd.DataFrame({'one': [1., 2., 3.]}) - In[2]: df.two = [4, 5, 6] + In[1]: df = pd.DataFrame({'one': [1., 2., 3.]}) + In[2]: df.two = [4, 5, 6] This does not raise any obvious exceptions, but also does not create a new column: .. code-block:: ipython - In[3]: df - Out[3]: - one - 0 1.0 - 1 2.0 - 2 3.0 - -Setting a list-like data structure into a new attribute now raises a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. - -.. _whatsnew_0210.enhancements.drop_api: - -``drop`` now also accepts index/columns keywords -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :meth:`~DataFrame.drop` method has gained ``index``/``columns`` keywords as an -alternative to specifying the ``axis``. This is similar to the behavior of ``reindex`` -(:issue:`12392`). - -For example: - -.. ipython:: python - - df = pd.DataFrame(np.arange(8).reshape(2,4), - columns=['A', 'B', 'C', 'D']) - df - df.drop(['B', 'C'], axis=1) - # the following is now equivalent - df.drop(columns=['B', 'C']) - -.. _whatsnew_0210.enhancements.rename_reindex_axis: - -``rename``, ``reindex`` now also accept axis keyword -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :meth:`DataFrame.rename` and :meth:`DataFrame.reindex` methods have gained -the ``axis`` keyword to specify the axis to target with the operation -(:issue:`12392`). - -Here's ``rename``: - -.. ipython:: python - - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - df.rename(str.lower, axis='columns') - df.rename(id, axis='index') - -And ``reindex``: - -.. ipython:: python - - df.reindex(['A', 'B', 'C'], axis='columns') - df.reindex([0, 1, 3], axis='index') - -The "index, columns" style continues to work as before. - -.. ipython:: python - - df.rename(index=id, columns=str.lower) - df.reindex(index=[0, 1, 3], columns=['A', 'B', 'C']) - -We *highly* encourage using named arguments to avoid confusion when using either -style. - -.. _whatsnew_0210.enhancements.categorical_dtype: - -``CategoricalDtype`` for specifying categoricals -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:class:`pandas.api.types.CategoricalDtype` has been added to the public API and -expanded to include the ``categories`` and ``ordered`` attributes. A -``CategoricalDtype`` can be used to specify the set of categories and -orderedness of an array, independent of the data. This can be useful for example, -when converting string data to a ``Categorical`` (:issue:`14711`, -:issue:`15078`, :issue:`16015`, :issue:`17643`): - -.. 
ipython:: python - - from pandas.api.types import CategoricalDtype - - s = pd.Series(['a', 'b', 'c', 'a']) # strings - dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) - s.astype(dtype) - -One place that deserves special mention is in :meth:`read_csv`. Previously, with -``dtype={'col': 'category'}``, the returned values and categories would always -be strings. - -.. ipython:: python - :suppress: - - from pandas.compat import StringIO - -.. ipython:: python - - data = 'A,B\na,1\nb,2\nc,3' - pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories - -Notice the "object" dtype. - -With a ``CategoricalDtype`` of all numerics, datetimes, or -timedeltas, we can automatically convert to the correct type - -.. ipython:: python - - dtype = {'B': CategoricalDtype([1, 2, 3])} - pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories - -The values have been correctly interpreted as integers. - -The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a -``Series`` with categorical type will now return an instance of -``CategoricalDtype``. While the repr has changed, ``str(CategoricalDtype())`` is -still the string ``'category'``. We'll take this moment to remind users that the -*preferred* way to detect categorical data is to use -:func:`pandas.api.types.is_categorical_dtype`, and not ``str(dtype) == 'category'``. - -See the :ref:`CategoricalDtype docs ` for more. - -.. _whatsnew_0210.enhancements.GroupBy_pipe: - -``GroupBy`` objects now have a ``pipe`` method -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``GroupBy`` objects now have a ``pipe`` method, similar to the one on -``DataFrame`` and ``Series``, that allow for functions that take a -``GroupBy`` to be composed in a clean, readable syntax. (:issue:`17871`) - -For a concrete example on combining ``.groupby`` and ``.pipe`` , imagine having a -DataFrame with columns for stores, products, revenue and sold quantity. We'd like to -do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product. -We could do this in a multi-step operation, but expressing it in terms of piping can make the -code more readable. - -First we set the data: - -.. ipython:: python - - import numpy as np - n = 1000 - df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), - 'Revenue': (np.random.random(n)*50+10).round(2), - 'Quantity': np.random.randint(1, 10, size=n)}) - df.head(2) + In[3]: df + Out[3]: + one + 0 1.0 + 1 2.0 + 2 3.0 -Now, to find prices per store/product, we can simply do: +The second source of confusion is creating a column whose name collides with a method or +attribute already in the instance namespace: -.. ipython:: python - - (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum()) - .unstack().round(2)) - -See the :ref:`documentation ` for more. - - -.. _whatsnew_0210.enhancements.reanme_categories: - -``Categorical.rename_categories`` accepts a dict-like -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:meth:`~Series.cat.rename_categories` now accepts a dict-like argument for -``new_categories``. The previous categories are looked up in the dictionary's -keys and replaced if found. The behavior of missing and extra keys is the same -as in :meth:`DataFrame.rename`. - -.. ipython:: python - - c = pd.Categorical(['a', 'a', 'b']) - c.rename_categories({"a": "eh", "b": "bee"}) +.. code-block:: ipython -.. 
warning:: + In[4]: df['sum'] = [5., 7., 9.] - To assist with upgrading pandas, ``rename_categories`` treats ``Series`` as - list-like. Typically, Series are considered to be dict-like (e.g. in - ``.rename``, ``.map``). In a future version of pandas ``rename_categories`` - will change to treat them as dict-like. Follow the warning message's - recommendations for writing future-proof code. +This does not permit that column to be accessed as an attribute: - .. code-block:: ipython +.. code-block:: ipython - In [33]: c.rename_categories(pd.Series([0, 1], index=['a', 'c'])) - FutureWarning: Treating Series 'new_categories' as a list-like and using the values. - In a future version, 'rename_categories' will treat Series like a dictionary. - For dict-like, use 'new_categories.to_dict()' - For list-like, use 'new_categories.values'. - Out[33]: - [0, 0, 1] - Categories (2, int64): [0, 1] + In[5]: df.sum + Out[5]: + +Both of these now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. .. _whatsnew_0210.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ -New functions or methods -"""""""""""""""""""""""" - -- :meth:`~pandas.core.resample.Resampler.nearest` is added to support nearest-neighbor upsampling (:issue:`17496`). -- :class:`~pandas.Index` has added support for a ``to_frame`` method (:issue:`15230`). - -New keywords -"""""""""""" - -- Added a ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to - support type inference in the presence of missing values (:issue:`17059`). +- The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. For more, see :ref:`here ` (:issue:`16270`) +- Added support for `PEP 518 `_ to the build system (:issue:`16745`) - :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) +- :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`) +- :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) - :func:`Series.set_axis` and :func:`DataFrame.set_axis` now support the ``inplace`` parameter. (:issue:`14636`) - :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ +- :func:`api.types.infer_dtype` now infers decimals. (:issue:`15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) -- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. 
(:issue:`17048`) -- :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`) - -Various enhancements -"""""""""""""""""""" -======= - :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) - :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) @@ -326,6 +133,7 @@ Various enhancements <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD - Improved the import time of pandas by about 2.25x. (:issue:`16764`) - Support for `PEP 519 -- Adding a file system path protocol @@ -359,23 +167,28 @@ Various enhancements >>>>>>> 8ed37cd... removed edits to whatsnew 0.21.0 >>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 +======= + + +>>>>>>> ceceae1... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts .. _whatsnew_0210.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. _whatsnew_0210.api_breaking.deps: Dependencies have increased minimum versions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We have updated our minimum supported versions of dependencies (:issue:`15206`, :issue:`15543`, :issue:`15214`). -If installed, we now require: +We have updated our minimum supported versions of dependencies (:issue:`15206`, :issue:`15543`, :issue:`15214`) +). If installed, we now require: +--------------+-----------------+----------+ | Package | Minimum Version | Required | - +==============+=================+==========+ + +======================+=========+==========+ | Numpy | 1.9.0 | X | +--------------+-----------------+----------+ | Matplotlib | 1.4.3 | | @@ -385,308 +198,6 @@ If installed, we now require: | Bottleneck | 1.0.0 | | +--------------+-----------------+----------+ -Additionally, support has been dropped for Python 3.4 (:issue:`15251`). - - -.. _whatsnew_0210.api_breaking.bottleneck: - -Sum/Prod of all-NaN Series/DataFrames is now consistently NaN -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on -whether `bottleneck `__ is installed. (:issue:`9422`, :issue:`15507`). - -Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `. - -.. ipython:: python - - s = Series([np.nan]) - -Previously NO ``bottleneck`` - -.. code-block:: ipython - - In [2]: s.sum() - Out[2]: np.nan - -Previously WITH ``bottleneck`` - -.. code-block:: ipython - - In [2]: s.sum() - Out[2]: 0.0 - -New Behavior, without regard to the bottleneck installation. - -.. ipython:: python - - s.sum() - -Note that this also changes the sum of an empty ``Series`` - -Previously regardless of ``bottlenck`` - -.. code-block:: ipython - - In [1]: pd.Series([]).sum() - Out[1]: 0 - -.. ipython:: python - - pd.Series([]).sum() - - -.. _whatsnew_0210.api_breaking.loc: - -Indexing with a list with missing labels is Deprecated -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, selecting with a list of labels, where one or more labels were missing would always succeed, returning ``NaN`` for missing labels. -This will now show a ``FutureWarning``. In the future this will raise a ``KeyError`` (:issue:`15747`). 
-This warning will trigger on a ``DataFrame`` or a ``Series`` for using ``.loc[]`` or ``[[]]`` when passing a list-of-labels with at least 1 missing label. -See the :ref:`deprecation docs `. - - -.. ipython:: python - - s = pd.Series([1, 2, 3]) - s - -Previous Behavior - -.. code-block:: ipython - - In [4]: s.loc[[1, 2, 3]] - Out[4]: - 1 2.0 - 2 3.0 - 3 NaN - dtype: float64 - - -Current Behavior - -.. code-block:: ipython - - In [4]: s.loc[[1, 2, 3]] - Passing list-likes to .loc or [] with any missing label will raise - KeyError in the future, you can use .reindex() as an alternative. - - See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike - - Out[4]: - 1 2.0 - 2 3.0 - 3 NaN - dtype: float64 - -The idiomatic way to achieve selecting potentially not-found elements is via ``.reindex()`` - -.. ipython:: python - - s.reindex([1, 2, 3]) - -Selection with all keys found is unchanged. - -.. ipython:: python - - s.loc[[1, 2]] - - -.. _whatsnew_0210.api.na_changes: - -NA naming Changes -^^^^^^^^^^^^^^^^^ - -In order to promote more consistency among the pandas API, we have added additional top-level -functions :func:`isna` and :func:`notna` that are aliases for :func:`isnull` and :func:`notnull`. -The naming scheme is now more consistent with methods like ``.dropna()`` and ``.fillna()``. Furthermore -in all cases where ``.isnull()`` and ``.notnull()`` methods are defined, these have additional methods -named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical``, -``Index``, ``Series``, and ``DataFrame``. (:issue:`15001`). - -The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. - - -.. _whatsnew_0210.api_breaking.iteration_scalars: - -Iteration of Series/Index will now return Python scalars -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a Python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affects int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). - -.. ipython:: python - - s = pd.Series([1, 2, 3]) - s - -Previously: - -.. code-block:: ipython - - In [2]: type(list(s)[0]) - Out[2]: numpy.int64 - -New Behaviour: - -.. ipython:: python - - type(list(s)[0]) - -Furthermore this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well. - -.. ipython:: python - - d = {'a':[1], 'b':['b']} - df = pd.DataFrame(d) - -Previously: - -.. code-block:: ipython - - In [8]: type(df.to_dict()['a'][0]) - Out[8]: numpy.int64 - -New Behaviour: - -.. ipython:: python - - type(df.to_dict()['a'][0]) - - -.. _whatsnew_0210.api_breaking.loc_with_index: - -Indexing with a Boolean Index -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously when passing a boolean ``Index`` to ``.loc``, if the index of the ``Series/DataFrame`` had ``boolean`` labels, -you would get a label based selection, potentially duplicating result labels, rather than a boolean indexing selection -(where ``True`` selects elements), this was inconsistent how a boolean numpy array indexed. The new behavior is to -act like a boolean numpy array indexer. (:issue:`17738`) - -Previous Behavior: - -.. 
ipython:: python - - s = pd.Series([1, 2, 3], index=[False, True, False]) - s - -.. code-block:: ipython - - In [59]: s.loc[pd.Index([True, False, True])] - Out[59]: - True 2 - False 1 - False 3 - True 2 - dtype: int64 - -Current Behavior - -.. ipython:: python - - s.loc[pd.Index([True, False, True])] - - -Furthermore, previously if you had an index that was non-numeric (e.g. strings), then a boolean Index would raise a ``KeyError``. -This will now be treated as a boolean indexer. - -Previously Behavior: - -.. ipython:: python - - s = pd.Series([1,2,3], index=['a', 'b', 'c']) - s - -.. code-block:: ipython - - In [39]: s.loc[pd.Index([True, False, True])] - KeyError: "None of [Index([True, False, True], dtype='object')] are in the [index]" - -Current Behavior - -.. ipython:: python - - s.loc[pd.Index([True, False, True])] - - -.. _whatsnew_0210.api_breaking.period_index_resampling: - -``PeriodIndex`` resampling -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) - -Previous Behavior: - -.. code-block:: ipython - - In [1]: pi = pd.period_range('2017-01', periods=12, freq='M') - - In [2]: s = pd.Series(np.arange(12), index=pi) - - In [3]: resampled = s.resample('2Q').mean() - - In [4]: resampled - Out[4]: - 2017-03-31 1.0 - 2017-09-30 5.5 - 2018-03-31 10.0 - Freq: 2Q-DEC, dtype: float64 - - In [5]: resampled.index - Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC') - -New Behavior: - -.. ipython:: python - - pi = pd.period_range('2017-01', periods=12, freq='M') - - s = pd.Series(np.arange(12), index=pi) - - resampled = s.resample('2Q').mean() - - resampled - - resampled.index - -Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. - -Previous Behavior: - -.. code-block:: ipython - - In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) - - In [2]: s = pd.Series(np.arange(10), index=pi) - - In [3]: s.resample('H').ohlc() - Out[3]: - 2000-01-01 00:00 0.0 - ... - 2000-01-10 23:00 NaN - Freq: H, Length: 240, dtype: float64 - - In [4]: s.resample('M').ohlc() - Out[4]: - open high low close - 2000-01 0 9 0 9 - -New Behavior: - -.. ipython:: python - - pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) - - s = pd.Series(np.arange(10), index=pi) - - s.resample('H').ohlc() - - s.resample('M').ohlc() - - .. _whatsnew_0210.api_breaking.pandas_eval: Improved error handling during item assignment in pd.eval @@ -733,70 +244,81 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment - -.. _whatsnew_0210.api_breaking.dtype_conversions: - Dtype Conversions ^^^^^^^^^^^^^^^^^ -Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to same the type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`). 
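As a minimal sketch of the assignment behavior this hunk documents (assuming pandas 0.21 or later; the variable name is illustrative):

.. code-block:: python

    import pandas as pd

    s = pd.Series([1, 2, 3])   # starts as int64
    s[1] = True                # a bool assignment no longer coerces to int
    print(s.dtype)             # object -- the bool is preserved as-is
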
+- Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to + the same type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`). -.. ipython:: python + .. ipython:: python - s = Series([1, 2, 3]) + s = Series([1, 2, 3]) -.. code-block:: python + .. code-block:: python - In [5]: s[1] = True + In [5]: s[1] = True - In [6]: s - Out[6]: - 0 1 - 1 1 - 2 3 - dtype: int64 + In [6]: s + Out[6]: + 0 1 + 1 1 + 2 3 + dtype: int64 -New Behavior + New Behavior -.. ipython:: python + .. ipython:: python - s[1] = True - s + s[1] = True + s -Previously, as assignment to a datetimelike with a non-datetimelike would coerce the -non-datetime-like item being assigned (:issue:`14145`). +- Previously, an assignment to a datetimelike with a non-datetimelike would coerce the + non-datetime-like item being assigned (:issue:`14145`). -.. ipython:: python + .. ipython:: python - s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')]) + s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')]) -.. code-block:: python + .. code-block:: python - In [1]: s[1] = 1 + In [1]: s[1] = 1 - In [2]: s - Out[2]: - 0 2011-01-01 00:00:00.000000000 - 1 1970-01-01 00:00:00.000000001 - dtype: datetime64[ns] + In [2]: s + Out[2]: + 0 2011-01-01 00:00:00.000000000 + 1 1970-01-01 00:00:00.000000001 + dtype: datetime64[ns] -These now coerce to ``object`` dtype. + These now coerce to ``object`` dtype. -.. ipython:: python + .. ipython:: python - s[1] = 1 - s + s[1] = 1 + s - Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) +.. _whatsnew_0210.api.na_changes: + +NA naming Changes +^^^^^^^^^^^^^^^^^ + +In order to promote more consistency among the pandas API, we have added additional top-level +functions :func:`isna` and :func:`notna` that are aliases for :func:`isnull` and :func:`notnull`. +The naming scheme is now more consistent with methods like ``.dropna()`` and ``.fillna()``. Furthermore +in all cases where ``.isnull()`` and ``.notnull()`` methods are defined, these have additional methods +named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical``, +``Index``, ``Series``, and ``DataFrame``. (:issue:`15001`). + +The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. .. _whatsnew_210.api.multiindex_single: MultiIndex Constructor with a Single Level ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``MultiIndex`` constructors no longer squeezes a MultiIndex with all +The ``MultiIndex`` constructors no longer squeeze a MultiIndex with all length-one levels down to a regular ``Index``. This affects all the ``MultiIndex`` constructors. (:issue:`17178`) @@ -822,94 +344,42 @@ UTC Localization with Series Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`). - Previous Behavior + .. ipython:: python - s = Series(['20130101 00:00:00'] * 3) + s = Series(['20130101 00:00:00'] * 3) -.. 
code-block:: ipython - In [12]: pd.to_datetime(s, utc=True) - Out[12]: - 0 2013-01-01 - 1 2013-01-01 - 2 2013-01-01 - dtype: datetime64[ns] + In [12]: pd.to_datetime(s, utc=True) + Out[12]: + 0 2013-01-01 + 1 2013-01-01 + 2 2013-01-01 + dtype: datetime64[ns] -New Behavior + New Behavior -.. ipython:: python + .. ipython:: python - pd.to_datetime(s, utc=True) + pd.to_datetime(s, utc=True) Additionally, DataFrames with datetime columns that were parsed by :func:`read_sql_table` and :func:`read_sql_query` will also be localized to UTC only if the original SQL columns were timezone aware datetime columns. -.. _whatsnew_0210.api.consistency_of_range_functions: - -Consistency of Range Functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`. (:issue:`17471`). - -One of the inconsistent behaviors occurred when the ``start``, ``end`` and ``period`` parameters were all specified, potentially leading to ambiguous ranges. When all three parameters were passed, ``interval_range`` ignored the ``period`` parameter, ``period_range`` ignored the ``end`` parameter, and the other range functions raised. To promote consistency among the range functions, and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range`` will now raise when all three parameters are passed. - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pd.interval_range(start=0, end=4, periods=6) - Out[2]: - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - - In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q') - Out[3]: PeriodIndex(['2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2'], dtype='period[Q-DEC]', freq='Q-DEC') - -New Behavior: - -.. code-block:: ipython - - In [2]: pd.interval_range(start=0, end=4, periods=6) - --------------------------------------------------------------------------- - ValueError: Of the three parameters: start, end, and periods, exactly two must be specified - - In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q') - --------------------------------------------------------------------------- - ValueError: Of the three parameters: start, end, and periods, exactly two must be specified - -Additionally, the endpoint parameter ``end`` was not included in the intervals produced by ``interval_range``. However, all other range functions include ``end`` in their output. To promote consistency among the range functions, ``interval_range`` will now include ``end`` as the right endpoint of the final interval, except if ``freq`` is specified in a way which skips ``end``. - -Previous Behavior: - -.. code-block:: ipython - - In [4]: pd.interval_range(start=0, end=4) - Out[4]: - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - - -New Behavior: - -.. ipython:: python - - pd.interval_range(start=0, end=4) - .. _whatsnew_0210.api: Other API Changes ^^^^^^^^^^^^^^^^^ +- Support has been dropped for Python 3.4 (:issue:`15251`) - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. 
(:issue:`16022`) - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) -- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) -- :func:`read_csv` now treats ``'null'`` and ``'n/a'`` strings as missing values by default (:issue:`16471`, :issue:`16078`) +- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) +- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). -- Compression defaults in HDF stores now follow pytables standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) +- Compression defaults in HDF stores now follow pytables standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) - Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - Moved definition of ``MergeError`` to the ``pandas.errors`` module. @@ -917,80 +387,15 @@ Other API Changes - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) - :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`). - :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`) -- :func:`to_datetime` now raises a ``ValueError`` when format includes ``%W`` or ``%U`` without also including day of the week and calendar year (:issue:`16774`) -- Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`) -- Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index. This will now raise a ValueError (:issue:`16877`) -- Restricted DateOffset keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`). -- Pandas no longer registers matplotlib converters on import. The converters - will be registered and used when the first plot is draw (:issue:`17710`) + .. _whatsnew_0210.deprecations: Deprecations ~~~~~~~~~~~~ - -- :meth:`DataFrame.from_csv` and :meth:`Series.from_csv` have been deprecated in favor of :func:`read_csv()` (:issue:`4191`) - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). -- :func:`read_excel()` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`) -- :func:`read_csv()` has deprecated the ``tupleize_cols`` argument. 
Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`) -- :meth:`DataFrame.to_csv` has deprecated the ``tupleize_cols`` argument. Multi-index columns will be always written as rows in the CSV file (:issue:`17060`) -- The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`) -- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). -- :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). -- :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`) -- ``pd.TimeGrouper`` is deprecated in favor of :class:`pandas.Grouper` (:issue:`16747`) -- ``cdate_range`` has been deprecated in favor of :func:`bdate_range`, which has gained ``weekmask`` and ``holidays`` parameters for building custom frequency date ranges. See the :ref:`documentation ` for more details (:issue:`17596`) -- passing ``categories`` or ``ordered`` kwargs to :func:`Series.astype` is deprecated, in favor of passing a :ref:`CategoricalDtype ` (:issue:`17636`) -- ``.get_value`` and ``.set_value`` on ``Series``, ``DataFrame``, ``Panel``, ``SparseSeries``, and ``SparseDataFrame`` are deprecated in favor of using ``.iat[]`` or ``.at[]`` accessors (:issue:`15269`) -- Passing a non-existent column in ``.to_excel(..., columns=)`` is deprecated and will raise a ``KeyError`` in the future (:issue:`17295`) -- ``raise_on_error`` parameter to :func:`Series.where`, :func:`Series.mask`, :func:`DataFrame.where`, :func:`DataFrame.mask` is deprecated, in favor of ``errors=`` (:issue:`14968`) -- Using :meth:`DataFrame.rename_axis` and :meth:`Series.rename_axis` to alter index or column *labels* is now deprecated in favor of using ``.rename``. ``rename_axis`` may still be used to alter the name of the index or columns (:issue:`17833`). -- :meth:`~DataFrame.reindex_axis` has been deprecated in favor of :meth:`~DataFrame.reindex`. See :ref:`here ` for more (:issue:`17833`). - -.. _whatsnew_0210.deprecations.select: - -Series.select and DataFrame.select -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The :meth:`Series.select` and :meth:`DataFrame.select` methods are deprecated in favor of using ``df.loc[labels.map(crit)]`` (:issue:`12401`) - -.. ipython:: python - - df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) - -.. code-block:: ipython - - In [3]: df.select(lambda x: x in ['bar', 'baz']) - FutureWarning: select is deprecated and will be removed in a future release. You can use .loc[crit] as a replacement - Out[3]: - A - bar 2 - baz 3 - -.. ipython:: python - - df.loc[df.index.map(lambda x: x in ['bar', 'baz'])] - - -.. _whatsnew_0210.deprecations.argmin_min: - -Series.argmax and Series.argmin -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The behavior of :func:`Series.argmax` and :func:`Series.argmin` have been deprecated in favor of :func:`Series.idxmax` and :func:`Series.idxmin`, respectively (:issue:`16830`). - -For compatibility with NumPy arrays, ``pd.Series`` implements ``argmax`` and -``argmin``. Since pandas 0.13.0, ``argmax`` has been an alias for -:meth:`pandas.Series.idxmax`, and ``argmin`` has been an alias for -:meth:`pandas.Series.idxmin`. They return the *label* of the maximum or minimum, -rather than the *position*. - -We've deprecated the current behavior of ``Series.argmax`` and -``Series.argmin``. Using either of these will emit a ``FutureWarning``. 
Use -:meth:`Series.idxmax` if you want the label of the maximum. Use -``Series.values.argmax()`` if you want the position of the maximum. Likewise for -the minimum. In a future release ``Series.argmax`` and ``Series.argmin`` will -return the position of the maximum or minimum. +- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). .. _whatsnew_0210.prior_deprecations: @@ -1016,38 +421,23 @@ Performance Improvements - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) - :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) -- Improved performance of :meth:`~Series.cat.set_categories` by not materializing the values (:issue:`17508`) -- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) -- Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`) -- Improved performance of :meth:`RangeIndex.min` and :meth:`RangeIndex.max` by using ``RangeIndex`` properties to perform the computations (:issue:`17607`) - -.. _whatsnew_0210.docs: -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`) -- The documentation has had references to versions < v0.17 removed and cleaned up (:issue:`17442`, :issue:`17442`, :issue:`17404` & :issue:`17504`) .. _whatsnew_0210.bug_fixes: Bug Fixes ~~~~~~~~~ + Conversion ^^^^^^^^^^ - Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) +- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`) - Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`) - Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`) - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`) -- Bug in :attr:`Timestamp.weekday_name` returning a UTC-based weekday name when localized to a timezone (:issue:`17354`) -- Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`) -- Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`) -- Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (`DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`). 
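A minimal sketch of the instance-versus-class distinction drawn in the ``astype`` bullet above (assuming pandas 0.21 or later; the example series is illustrative):

.. code-block:: python

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    s = pd.Series(['a', 'b', 'a'])
    print(s.astype(CategoricalDtype()).dtype)   # a dtype *instance* converts: category
    # s.astype(CategoricalDtype)                # passing the *class* now raises TypeError
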
-- Bug in :meth:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`) -- Bug in ``DataFrame`` and ``Series`` constructors where ``range`` objects are converted to ``int32`` dtype on Windows instead of ``int64`` (:issue:`16804`) Indexing ^^^^^^^^ @@ -1065,40 +455,22 @@ Indexing - Bug in ``.iloc`` when used with inplace addition or assignment and an int indexer on a ``MultiIndex`` causing the wrong indexes to be read from and written to (:issue:`17148`) - Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) - Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`) -- Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`) -- Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`) -- Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when no valid entry (:issue:`17400`) -- Bug in :func:`Series.rename` when called with a callable, incorrectly alters the name of the ``Series``, rather than the name of the ``Index``. (:issue:`17407`) -- Bug in :func:`String.str_get` raises ``IndexError`` instead of inserting NaNs when using a negative index. (:issue:`17704`) I/O ^^^ -- Bug in :func:`read_hdf` when reading a timezone aware index from ``fixed`` format HDFStore (:issue:`17618`) - Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`) - Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`) - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`). - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). 
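For the ``read_csv`` de-duplication fixes listed above, a short sketch of how duplicate headers are now mangled (assuming pandas 0.21 or later; the exact suffixes in the comment are our reading of the fix):

.. code-block:: python

    import pandas as pd
    from pandas.compat import StringIO  # matching the compat import used elsewhere in these notes

    data = 'a,a,a.1\n1,2,3'
    df = pd.read_csv(StringIO(data))
    # duplicate headers are de-duplicated without silently colliding,
    # e.g. ['a', 'a.1', 'a.1.1'] rather than two columns both named 'a.1'
    print(df.columns.tolist())
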
- Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) -- Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) -- Bug in :func:`read_stata` where the index was not set (:issue:`16342`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) -- Bug in :func:`read_csv` where automatic delimiter detection caused a ``TypeError`` to be thrown when a bad line was encountered rather than the correct error message (:issue:`13374`) -- Bug in :meth:`DataFrame.to_html` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`) -- Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`) -- Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`) -- Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`) Plotting ^^^^^^^^ - Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`) -- Bug when plotting ``timedelta`` and ``datetime`` dtypes on y-axis (:issue:`16953`) -- Line plots no longer assume monotonic x data when calculating xlims, they show the entire lines now even for unsorted x data. (:issue:`11310`, :issue:`11471`) -- With matplotlib 2.0.0 and above, calculation of x limits for line plots is left to matplotlib, so that its new default settings are applied. 
(:issue:`15495`) -- Bug in ``Series.plot.bar`` or ``DataFrame.plot.bar`` with ``y`` not respecting user-passed ``color`` (:issue:`16822`) -- Bug causing ``plotting.parallel_coordinates`` to reset the random seed when using random colors (:issue:`17525`) Groupby/Resample/Rolling @@ -1109,21 +481,15 @@ Groupby/Resample/Rolling - Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) - Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) - Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`) -- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`) +- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`) - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) -- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`) -- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`) -- Bug in ``DataFrame.groupby`` where spurious warning is raised when ``Grouper`` object is used to override ambiguous column name (:issue:`17383`) -- Bug in ``TimeGrouper`` differs when passes as a list and as a scalar (:issue:`17530`) Sparse ^^^^^^ - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) -- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`) -- Bug in :func:`make_sparse` treating two numeric/boolean data, which have same bits, as same when array ``dtype`` is ``object`` (:issue:`17574`) -- :func:`SparseArray.all` and :func:`SparseArray.any` are now implemented to handle ``SparseArray``, these were used but not implemented (:issue:`17570`) + Reshaping ^^^^^^^^^ @@ -1137,11 +503,6 @@ Reshaping - :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`). 
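A minimal sketch of the ``argmin``/``argmax`` behavior described in the bullet above (assuming pandas 0.21 or later):

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.inf, -np.inf])
    print(s.idxmax())   # 1 -- infinite values are ranked correctly
    print(s.idxmin())   # 2
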
- Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) - Bug in :func:`concat` where order of result index was unpredictable if it contained non-comparable elements (:issue:`17344`) -- Fixes regression when sorting by multiple columns on a ``datetime64`` dtype ``Series`` with ``NaT`` values (:issue:`16836`) -- Bug in :func:`pivot_table` where the result's columns did not preserve the categorical dtype of ``columns`` when ``dropna`` was ``False`` (:issue:`17842`) -- Bug in ``DataFrame.drop_duplicates`` where dropping with non-unique column names raised a ``ValueError`` (:issue:`17836`) -- Bug in :func:`unstack` which, when called on a list of levels, would discard the ``fillna`` argument (:issue:`13971`) -- Bug in the alignment of ``range`` objects and other list-likes with ``DataFrame`` leading to operations being performed row-wise instead of column-wise (:issue:`17901`) Numeric ^^^^^^^ @@ -1151,26 +512,13 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) -- Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) -- Bug in categorical operations with :ref:`Series.cat ` not preserving the original Series' name (:issue:`17509`) -- Bug in :func:`DataFrame.merge` failing for categorical columns with boolean/int data types (:issue:`17187`) -- Bug in constructing a ``Categorical``/``CategoricalDtype`` when the specified ``categories`` are of categorical type (:issue:`17884`). - -.. _whatsnew_0210.pypy: - -PyPy -^^^^ +- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) +- Bug in the categorical constructor with empty values and categories causing + the ``.categories`` to be an empty ``Float64Index`` rather than an empty + ``Index`` with object dtype (:issue:`17248`) -- Compatibility with PyPy in :func:`read_csv` with ``usecols=[]`` and - :func:`read_json` (:issue:`17351`) -- Split tests into cases for CPython and PyPy where needed, which highlights the fragility - of index matching with ``float('nan')``, ``np.nan`` and ``NAT`` (:issue:`17351`) -- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, - so an approximation is used instead (:issue:`17228`) Other ^^^^^ -- Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) - +- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 3c472fec84d43..865e51ca985de 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -239,6 +239,7 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): grp = mframe.groupby(level='second') results = set([v for v in dir(grp) if not v.startswith('_')]) +<<<<<<< HEAD expected = set( ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', >>>>>>> e99897c... 
ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 'is_monotonic_increasing', 'is_monotonic_decreasing']) +======= + expected = { + 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', + 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', + 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', + 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', + 'nunique', 'head', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', + 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', + 'is_monotonic_increasing', 'is_monotonic_decreasing'} +>>>>>>> ceceae1... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts assert results == expected From 4ecc4791fc8ca90e490a14a59ef8fc1c3cda534a Mon Sep 17 00:00:00 2001 From: No-Stream Date: Tue, 31 Oct 2017 16:41:04 -0700 Subject: [PATCH 09/21] ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts --- doc/source/whatsnew/v0.21.0.txt | 899 +++++++++++++++++++++++++++----- 1 file changed, 781 insertions(+), 118 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 0c01d2f19a62d..4c8e094e69068 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1,33 +1,49 @@ .. _whatsnew_0210: -v0.21.0 (???) -------------- +v0.21.0 (October 27, 2017) +-------------------------- -This is a major release from 0.20.x and includes a number of API changes, deprecations, new features, +This is a major release from 0.20.3 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: -- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. +- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying + categoricals independent of the data, see :ref:`here `. +- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here `. +- Compatibility fixes for pypy, see :ref:`here `. +- Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. +- Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). +- Indexing with a list of labels, where one or more of the labels is missing, is deprecated and will raise a KeyError in a future version, see :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. .. contents:: What's new in v0.21.0 :local: :backlinks: none + :depth: 2 .. 
_whatsnew_0210.enhancements: New features ~~~~~~~~~~~~ -- Support for `PEP 519 -- Adding a file system path protocol - `_ on most readers and writers (:issue:`13823`) -- Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, - and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) -- Added ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to - support type inference in the presence of missing values (:issue:`17059`). +.. _whatsnew_0210.enhancements.parquet: + +Integration with Apache Parquet file format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here ` (:issue:`15838`, :issue:`17438`). + +`Apache Parquet `__ provides a cross-language, binary file format for reading and writing data frames efficiently. +Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, supporting all of the pandas +dtypes, including extension dtypes such as datetime with timezones. + +This functionality depends on either the `pyarrow `__ or `fastparquet `__ library. +For more details, see :ref:`the IO docs on Parquet `. + .. _whatsnew_0210.enhancements.infer_objects: @@ -41,7 +57,7 @@ method. See the documentation :ref:`here ` for more details. (:issue:`11221`) This method only performs soft conversions on object columns, converting Python objects -to native types, but not any coercive conversions. For example: +to native types, but not any coercive conversions. For example: .. ipython:: python @@ -52,7 +68,7 @@ to native types, but not any coercive conversions. For example: df.infer_objects().dtypes Note that column ``'C'`` was not converted - only scalar numeric types -will be inferred to a new type. Other types of conversion should be accomplished +will be converted to a new type. Other types of conversion should be accomplished using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). .. ipython:: python @@ -66,62 +82,234 @@ using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedel Improved warnings when attempting to create columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -New users are often flummoxed by the relationship between column operations and attribute -access on ``DataFrame`` instances (:issue:`5904` & :issue:`7175`). Two specific instances -of this confusion include attempting to create a new column by setting into an attribute: +New users are often puzzled by the relationship between column operations and +attribute access on ``DataFrame`` instances (:issue:`7175`). One specific +instance of this confusion is attempting to create a new column by setting an +attribute on the ``DataFrame``: .. code-block:: ipython - In[1]: df = pd.DataFrame({'one': [1., 2., 3.]}) - In[2]: df.two = [4, 5, 6] + In[1]: df = pd.DataFrame({'one': [1., 2., 3.]}) + In[2]: df.two = [4, 5, 6] This does not raise any obvious exceptions, but also does not create a new column: .. code-block:: ipython - In[3]: df - Out[3]: - one - 0 1.0 - 1 2.0 - 2 3.0 + In[3]: df + Out[3]: + one + 0 1.0 + 1 2.0 + 2 3.0 -The second source of confusion is creating a column whose name collides with a method or -attribute already in the instance namespace: +Setting a list-like data structure into a new attribute now raises a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. -.. 
code-block:: ipython +.. _whatsnew_0210.enhancements.drop_api: - In[4]: df['sum'] = [5., 7., 9.] +``drop`` now also accepts index/columns keywords +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This does not permit that column to be accessed as an attribute: +The :meth:`~DataFrame.drop` method has gained ``index``/``columns`` keywords as an +alternative to specifying the ``axis``. This is similar to the behavior of ``reindex`` +(:issue:`12392`). -.. code-block:: ipython +For example: + +.. ipython:: python + + df = pd.DataFrame(np.arange(8).reshape(2,4), + columns=['A', 'B', 'C', 'D']) + df + df.drop(['B', 'C'], axis=1) + # the following is now equivalent + df.drop(columns=['B', 'C']) + +.. _whatsnew_0210.enhancements.rename_reindex_axis: + +``rename``, ``reindex`` now also accept axis keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.rename` and :meth:`DataFrame.reindex` methods have gained +the ``axis`` keyword to specify the axis to target with the operation +(:issue:`12392`). + +Here's ``rename``: + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.rename(str.lower, axis='columns') + df.rename(id, axis='index') + +And ``reindex``: + +.. ipython:: python + + df.reindex(['A', 'B', 'C'], axis='columns') + df.reindex([0, 1, 3], axis='index') + +The "index, columns" style continues to work as before. + +.. ipython:: python + + df.rename(index=id, columns=str.lower) + df.reindex(index=[0, 1, 3], columns=['A', 'B', 'C']) + +We *highly* encourage using named arguments to avoid confusion when using either +style. + +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`pandas.api.types.CategoricalDtype` has been added to the public API and +expanded to include the ``categories`` and ``ordered`` attributes. A +``CategoricalDtype`` can be used to specify the set of categories and +orderedness of an array, independent of the data. This can be useful for example, +when converting string data to a ``Categorical`` (:issue:`14711`, +:issue:`15078`, :issue:`16015`, :issue:`17643`): + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +One place that deserves special mention is in :meth:`read_csv`. Previously, with +``dtype={'col': 'category'}``, the returned values and categories would always +be strings. + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +.. ipython:: python + + data = 'A,B\na,1\nb,2\nc,3' + pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories + +Notice the "object" dtype. + +With a ``CategoricalDtype`` of all numerics, datetimes, or +timedeltas, we can automatically convert to the correct type + +.. ipython:: python + + dtype = {'B': CategoricalDtype([1, 2, 3])} + pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories + +The values have been correctly interpreted as integers. + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of +``CategoricalDtype``. While the repr has changed, ``str(CategoricalDtype())`` is +still the string ``'category'``. 
We'll take this moment to remind users that the +*preferred* way to detect categorical data is to use +:func:`pandas.api.types.is_categorical_dtype`, and not ``str(dtype) == 'category'``. + +See the :ref:`CategoricalDtype docs ` for more. + +.. _whatsnew_0210.enhancements.GroupBy_pipe: + +``GroupBy`` objects now have a ``pipe`` method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``GroupBy`` objects now have a ``pipe`` method, similar to the one on +``DataFrame`` and ``Series``, that allows functions that take a +``GroupBy`` to be composed in a clean, readable syntax. (:issue:`17871`) + +For a concrete example of combining ``.groupby`` and ``.pipe``, imagine having a +DataFrame with columns for stores, products, revenue and sold quantity. We'd like to +do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product. +We could do this in a multi-step operation, but expressing it in terms of piping can make the +code more readable. + +First we set the data: + +.. ipython:: python + + import numpy as np + n = 1000 + df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), + 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Revenue': (np.random.random(n)*50+10).round(2), + 'Quantity': np.random.randint(1, 10, size=n)}) + df.head(2) + +Now, to find prices per store/product, we can simply do: + +.. ipython:: python - In[5]: df.sum - Out[5]: - + (df.groupby(['Store', 'Product']) + .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum()) + .unstack().round(2)) + +See the :ref:`documentation ` for more. + + +.. _whatsnew_0210.enhancements.rename_categories: + +``Categorical.rename_categories`` accepts a dict-like +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`~Series.cat.rename_categories` now accepts a dict-like argument for +``new_categories``. The previous categories are looked up in the dictionary's +keys and replaced if found. The behavior of missing and extra keys is the same +as in :meth:`DataFrame.rename`. + +.. ipython:: python + + c = pd.Categorical(['a', 'a', 'b']) + c.rename_categories({"a": "eh", "b": "bee"}) + +.. warning:: + + To assist with upgrading pandas, ``rename_categories`` treats ``Series`` as + list-like. Typically, Series are considered to be dict-like (e.g. in + ``.rename``, ``.map``). In a future version of pandas ``rename_categories`` + will change to treat them as dict-like. Follow the warning message's + recommendations for writing future-proof code. + + .. code-block:: ipython + + In [33]: c.rename_categories(pd.Series([0, 1], index=['a', 'c'])) + FutureWarning: Treating Series 'new_categories' as a list-like and using the values. + In a future version, 'rename_categories' will treat Series like a dictionary. + For dict-like, use 'new_categories.to_dict()' + For list-like, use 'new_categories.values'. + Out[33]: + [0, 0, 1] + Categories (2, int64): [0, 1] -Both of these now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. .. _whatsnew_0210.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ -- The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. 
For more, see :ref:`here ` (:issue:`16270`) -- Added support for `PEP 518 `_ to the build system (:issue:`16745`) +New functions or methods +"""""""""""""""""""""""" + +- :meth:`~pandas.core.resample.Resampler.nearest` is added to support nearest-neighbor upsampling (:issue:`17496`). +- :class:`~pandas.Index` has added support for a ``to_frame`` method (:issue:`15230`). + +New keywords +"""""""""""" + +- Added a ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to + support type inference in the presence of missing values (:issue:`17059`). - :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) -- :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`) -- :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) - :func:`Series.set_axis` and :func:`DataFrame.set_axis` now support the ``inplace`` parameter. (:issue:`14636`) - :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ -- :func:`api.types.infer_dtype` now infers decimals. (:issue:`15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) +<<<<<<< HEAD - :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) - :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) @@ -135,6 +323,14 @@ Other Enhancements <<<<<<< HEAD <<<<<<< HEAD +======= +- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) +- :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`) + +Various enhancements +"""""""""""""""""""" + +>>>>>>> 95d4ba8... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts - Improved the import time of pandas by about 2.25x. (:issue:`16764`) - Support for `PEP 519 -- Adding a file system path protocol `_ on most readers (e.g. @@ -158,6 +354,7 @@ Other Enhancements - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`) - :func:`Series.reindex`, :func:`DataFrame.reindex`, :func:`Index.get_indexer` now support list-like argument for ``tolerance``. 
(:issue:`17367`) +<<<<<<< HEAD ======= - :func: groupby.is_monotonic_increasing and .is_monotonic_decreasing extend Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) ======= @@ -171,24 +368,25 @@ Other Enhancements >>>>>>> ceceae1... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts +======= +>>>>>>> 95d4ba8... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts .. _whatsnew_0210.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. _whatsnew_0210.api_breaking.deps: Dependencies have increased minimum versions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We have updated our minimum supported versions of dependencies (:issue:`15206`, :issue:`15543`, :issue:`15214`) -). If installed, we now require: +We have updated our minimum supported versions of dependencies (:issue:`15206`, :issue:`15543`, :issue:`15214`). +If installed, we now require: +--------------+-----------------+----------+ | Package | Minimum Version | Required | - +======================+=========+==========+ + +==============+=================+==========+ | Numpy | 1.9.0 | X | +--------------+-----------------+----------+ | Matplotlib | 1.4.3 | | @@ -198,6 +396,308 @@ We have updated our minimum supported versions of dependencies (:issue:`15206`, | Bottleneck | 1.0.0 | | +--------------+-----------------+----------+ +Additionally, support has been dropped for Python 3.4 (:issue:`15251`). + + +.. _whatsnew_0210.api_breaking.bottleneck: + +Sum/Prod of all-NaN Series/DataFrames is now consistently NaN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on +whether `bottleneck `__ is installed. (:issue:`9422`, :issue:`15507`). + +Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `. + +.. ipython:: python + + s = Series([np.nan]) + +Previously NO ``bottleneck`` + +.. code-block:: ipython + + In [2]: s.sum() + Out[2]: np.nan + +Previously WITH ``bottleneck`` + +.. code-block:: ipython + + In [2]: s.sum() + Out[2]: 0.0 + +New Behavior, without regard to the bottleneck installation. + +.. ipython:: python + + s.sum() + +Note that this also changes the sum of an empty ``Series``. + +Previously regardless of ``bottleneck`` + +.. code-block:: ipython + + In [1]: pd.Series([]).sum() + Out[1]: 0 + +.. ipython:: python + + pd.Series([]).sum() + + +.. _whatsnew_0210.api_breaking.loc: + +Indexing with a list with missing labels is Deprecated +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, selecting with a list of labels, where one or more labels were missing would always succeed, returning ``NaN`` for missing labels. +This will now show a ``FutureWarning``. In the future this will raise a ``KeyError`` (:issue:`15747`). +This warning will trigger on a ``DataFrame`` or a ``Series`` for using ``.loc[]`` or ``[[]]`` when passing a list-of-labels with at least 1 missing label. +See the :ref:`deprecation docs `. + + +.. ipython:: python + + s = pd.Series([1, 2, 3]) + s + +Previous Behavior + +.. code-block:: ipython + + In [4]: s.loc[[1, 2, 3]] + Out[4]: + 1 2.0 + 2 3.0 + 3 NaN + dtype: float64 + + +Current Behavior + +.. 
code-block:: ipython + + In [4]: s.loc[[1, 2, 3]] + Passing list-likes to .loc or [] with any missing label will raise + KeyError in the future, you can use .reindex() as an alternative. + + See the documentation here: + http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + + Out[4]: + 1 2.0 + 2 3.0 + 3 NaN + dtype: float64 + +The idiomatic way to achieve selecting potentially not-found elements is via ``.reindex()`` + +.. ipython:: python + + s.reindex([1, 2, 3]) + +Selection with all keys found is unchanged. + +.. ipython:: python + + s.loc[[1, 2]] + + +.. _whatsnew_0210.api.na_changes: + +NA naming Changes +^^^^^^^^^^^^^^^^^ + +In order to promote more consistency among the pandas API, we have added additional top-level +functions :func:`isna` and :func:`notna` that are aliases for :func:`isnull` and :func:`notnull`. +The naming scheme is now more consistent with methods like ``.dropna()`` and ``.fillna()``. Furthermore +in all cases where ``.isnull()`` and ``.notnull()`` methods are defined, these have additional methods +named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical``, +``Index``, ``Series``, and ``DataFrame``. (:issue:`15001`). + +The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. + + +.. _whatsnew_0210.api_breaking.iteration_scalars: + +Iteration of Series/Index will now return Python scalars +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a Python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affects int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). + +.. ipython:: python + + s = pd.Series([1, 2, 3]) + s + +Previously: + +.. code-block:: ipython + + In [2]: type(list(s)[0]) + Out[2]: numpy.int64 + +New Behaviour: + +.. ipython:: python + + type(list(s)[0]) + +Furthermore, this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well. + +.. ipython:: python + + d = {'a':[1], 'b':['b']} + df = pd.DataFrame(d) + +Previously: + +.. code-block:: ipython + + In [8]: type(df.to_dict()['a'][0]) + Out[8]: numpy.int64 + +New Behaviour: + +.. ipython:: python + + type(df.to_dict()['a'][0]) + + +.. _whatsnew_0210.api_breaking.loc_with_index: + +Indexing with a Boolean Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously when passing a boolean ``Index`` to ``.loc``, if the index of the ``Series/DataFrame`` had ``boolean`` labels, +you would get a label based selection, potentially duplicating result labels, rather than a boolean indexing selection +(where ``True`` selects elements), this was inconsistent with how a boolean numpy array indexed. The new behavior is to +act like a boolean numpy array indexer. (:issue:`17738`) + +Previous Behavior: + +.. ipython:: python + + s = pd.Series([1, 2, 3], index=[False, True, False]) + s + +.. code-block:: ipython + + In [59]: s.loc[pd.Index([True, False, True])] + Out[59]: + True 2 + False 1 + False 3 + True 2 + dtype: int64 + +Current Behavior + +.. ipython:: python + + s.loc[pd.Index([True, False, True])] + + +Furthermore, previously if you had an index that was non-numeric (e.g. 
strings), then a boolean Index would raise a ``KeyError``. +This will now be treated as a boolean indexer. + +Previously Behavior: + +.. ipython:: python + + s = pd.Series([1,2,3], index=['a', 'b', 'c']) + s + +.. code-block:: ipython + + In [39]: s.loc[pd.Index([True, False, True])] + KeyError: "None of [Index([True, False, True], dtype='object')] are in the [index]" + +Current Behavior + +.. ipython:: python + + s.loc[pd.Index([True, False, True])] + + +.. _whatsnew_0210.api_breaking.period_index_resampling: + +``PeriodIndex`` resampling +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.period_range('2017-01', periods=12, freq='M') + + In [2]: s = pd.Series(np.arange(12), index=pi) + + In [3]: resampled = s.resample('2Q').mean() + + In [4]: resampled + Out[4]: + 2017-03-31 1.0 + 2017-09-30 5.5 + 2018-03-31 10.0 + Freq: 2Q-DEC, dtype: float64 + + In [5]: resampled.index + Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC') + +New Behavior: + +.. ipython:: python + + pi = pd.period_range('2017-01', periods=12, freq='M') + + s = pd.Series(np.arange(12), index=pi) + + resampled = s.resample('2Q').mean() + + resampled + + resampled.index + +Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + In [2]: s = pd.Series(np.arange(10), index=pi) + + In [3]: s.resample('H').ohlc() + Out[3]: + 2000-01-01 00:00 0.0 + ... + 2000-01-10 23:00 NaN + Freq: H, Length: 240, dtype: float64 + + In [4]: s.resample('M').ohlc() + Out[4]: + open high low close + 2000-01 0 9 0 9 + +New Behavior: + +.. ipython:: python + + pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + s = pd.Series(np.arange(10), index=pi) + + s.resample('H').ohlc() + + s.resample('M').ohlc() + + .. _whatsnew_0210.api_breaking.pandas_eval: Improved error handling during item assignment in pd.eval @@ -244,81 +744,70 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment + +.. _whatsnew_0210.api_breaking.dtype_conversions: + Dtype Conversions ^^^^^^^^^^^^^^^^^ -- Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to - same the type (e.g. int / float), or raise for datetimelikes. These will now preseve the bools with ``object`` dtypes. (:issue:`16821`). +Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to same the type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`). - .. ipython:: python +.. ipython:: python - s = Series([1, 2, 3]) + s = Series([1, 2, 3]) - .. code-block:: python +.. 
code-block:: python - In [5]: s[1] = True + In [5]: s[1] = True - In [6]: s - Out[6]: - 0 1 - 1 1 - 2 3 - dtype: int64 + In [6]: s + Out[6]: + 0 1 + 1 1 + 2 3 + dtype: int64 - New Behavior +New Behavior - .. ipython:: python +.. ipython:: python - s[1] = True - s + s[1] = True + s -- Previously, as assignment to a datetimelike with a non-datetimelike would coerce the - non-datetime-like item being assigned (:issue:`14145`). +Previously, as assignment to a datetimelike with a non-datetimelike would coerce the +non-datetime-like item being assigned (:issue:`14145`). - .. ipython:: python +.. ipython:: python - s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')]) + s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')]) - .. code-block:: python +.. code-block:: python - In [1]: s[1] = 1 + In [1]: s[1] = 1 - In [2]: s - Out[2]: - 0 2011-01-01 00:00:00.000000000 - 1 1970-01-01 00:00:00.000000001 - dtype: datetime64[ns] + In [2]: s + Out[2]: + 0 2011-01-01 00:00:00.000000000 + 1 1970-01-01 00:00:00.000000001 + dtype: datetime64[ns] - These now coerce to ``object`` dtype. +These now coerce to ``object`` dtype. - .. ipython:: python +.. ipython:: python - s[1] = 1 - s + s[1] = 1 + s - Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) -.. _whatsnew_0210.api.na_changes: - -NA naming Changes -^^^^^^^^^^^^^^^^^ - -In order to promote more consistency among the pandas API, we have added additional top-level -functions :func:`isna` and :func:`notna` that are aliases for :func:`isnull` and :func:`notnull`. -The naming scheme is now more consistent with methods like ``.dropna()`` and ``.fillna()``. Furthermore -in all cases where ``.isnull()`` and ``.notnull()`` methods are defined, these have additional methods -named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical``, -``Index``, ``Series``, and ``DataFrame``. (:issue:`15001`). - -The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. .. _whatsnew_210.api.multiindex_single: MultiIndex Constructor with a Single Level ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``MultiIndex`` constructors no longer squeeze a MultiIndex with all +The ``MultiIndex`` constructors no longer squeezes a MultiIndex with all length-one levels down to a regular ``Index``. This affects all the ``MultiIndex`` constructors. (:issue:`17178`) @@ -344,42 +833,94 @@ UTC Localization with Series Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`). - Previous Behavior +Previous Behavior - .. ipython:: python +.. ipython:: python - s = Series(['20130101 00:00:00'] * 3) + s = Series(['20130101 00:00:00'] * 3) - .. code-block:: ipython +.. code-block:: ipython - In [12]: pd.to_datetime(s, utc=True) - Out[12]: - 0 2013-01-01 - 1 2013-01-01 - 2 2013-01-01 - dtype: datetime64[ns] + In [12]: pd.to_datetime(s, utc=True) + Out[12]: + 0 2013-01-01 + 1 2013-01-01 + 2 2013-01-01 + dtype: datetime64[ns] - New Behavior +New Behavior - .. ipython:: python +.. 
ipython:: python

-      pd.to_datetime(s, utc=True)
+   pd.to_datetime(s, utc=True)

Additionally, DataFrames with datetime columns that were parsed by :func:`read_sql_table` and :func:`read_sql_query` will also be localized to UTC only if the original SQL columns were timezone aware datetime columns.

+.. _whatsnew_0210.api.consistency_of_range_functions:
+
+Consistency of Range Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range` (:issue:`17471`).
+
+One of the inconsistent behaviors occurred when the ``start``, ``end`` and ``periods`` parameters were all specified, potentially leading to ambiguous ranges. When all three parameters were passed, ``interval_range`` ignored the ``periods`` parameter, ``period_range`` ignored the ``end`` parameter, and the other range functions raised. To promote consistency among the range functions, and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range`` will now raise when all three parameters are passed.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [2]: pd.interval_range(start=0, end=4, periods=6)
+   Out[2]:
+   IntervalIndex([(0, 1], (1, 2], (2, 3]]
+                 closed='right',
+                 dtype='interval[int64]')
+
+   In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q')
+   Out[3]: PeriodIndex(['2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2'], dtype='period[Q-DEC]', freq='Q-DEC')
+
+New Behavior:
+
+.. code-block:: ipython
+
+   In [2]: pd.interval_range(start=0, end=4, periods=6)
+   ---------------------------------------------------------------------------
+   ValueError: Of the three parameters: start, end, and periods, exactly two must be specified
+
+   In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q')
+   ---------------------------------------------------------------------------
+   ValueError: Of the three parameters: start, end, and periods, exactly two must be specified
+
+Additionally, the endpoint parameter ``end`` was not included in the intervals produced by ``interval_range``, whereas all other range functions include ``end`` in their output. To promote consistency among the range functions, ``interval_range`` will now include ``end`` as the right endpoint of the final interval, except if ``freq`` is specified in a way that skips ``end``.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [4]: pd.interval_range(start=0, end=4)
+   Out[4]:
+   IntervalIndex([(0, 1], (1, 2], (2, 3]]
+                 closed='right',
+                 dtype='interval[int64]')
+
+New Behavior:
+
+.. ipython:: python
+
+   pd.interval_range(start=0, end=4)
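As an editorial aside (not part of the original patch), the unambiguous way to construct these ranges under the new rules is to pass exactly two of ``start``, ``end``, and ``periods``:

.. code-block:: python

   import pandas as pd

   # exactly two of start/end/periods are given, so no ValueError is raised
   pd.interval_range(start=0, periods=4)                 # (0, 1], (1, 2], (2, 3], (3, 4]
   pd.period_range(start='2017Q1', periods=4, freq='Q')  # 2017Q1 ... 2017Q4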
.. _whatsnew_0210.api:

Other API Changes
^^^^^^^^^^^^^^^^^

-- Support has been dropped for Python 3.4 (:issue:`15251`)
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
-- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
-- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
+- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
+- :func:`read_csv` now treats ``'null'`` and ``'n/a'`` strings as missing values by default (:issue:`16471`, :issue:`16078`)
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
-- Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
+- Compression defaults in HDF stores now follow pytables standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
- ``Index.get_indexer_non_unique()`` now returns an ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`)
- Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`)
- Moved definition of ``MergeError`` to the ``pandas.errors`` module.
@@ -387,15 +928,80 @@ Other API Changes
- :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`)
- :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`).
- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`)
-
+- :func:`to_datetime` now raises a ``ValueError`` when format includes ``%W`` or ``%U`` without also including day of the week and calendar year (:issue:`16774`)
+- Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`)
+- Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index. This will now raise a ``ValueError`` (:issue:`16877`)
+- Restricted DateOffset keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`).
+- Pandas no longer registers matplotlib converters on import. The converters
+  will be registered and used when the first plot is drawn (:issue:`17710`)

.. _whatsnew_0210.deprecations:

Deprecations
~~~~~~~~~~~~
-- :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`).
+- :meth:`DataFrame.from_csv` and :meth:`Series.from_csv` have been deprecated in favor of :func:`read_csv()` (:issue:`4191`)
+- :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`).
+- :func:`read_excel()` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`)
+- :func:`read_csv()` has deprecated the ``tupleize_cols`` argument. Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`)
+- :meth:`DataFrame.to_csv` has deprecated the ``tupleize_cols`` argument. Multi-index columns will always be written as rows in the CSV file (:issue:`17060`)
+- The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`)
- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`).
+- :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`).
+- :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`)
+- ``pd.TimeGrouper`` is deprecated in favor of :class:`pandas.Grouper` (:issue:`16747`)
+- ``cdate_range`` has been deprecated in favor of :func:`bdate_range`, which has gained ``weekmask`` and ``holidays`` parameters for building custom frequency date ranges. See the :ref:`documentation ` for more details (:issue:`17596`)
+- Passing ``categories`` or ``ordered`` kwargs to :func:`Series.astype` is deprecated, in favor of passing a :ref:`CategoricalDtype ` (:issue:`17636`)
+- ``.get_value`` and ``.set_value`` on ``Series``, ``DataFrame``, ``Panel``, ``SparseSeries``, and ``SparseDataFrame`` are deprecated in favor of using ``.iat[]`` or ``.at[]`` accessors (:issue:`15269`); a short sketch follows this list
+- Passing a non-existent column in ``.to_excel(..., columns=)`` is deprecated and will raise a ``KeyError`` in the future (:issue:`17295`)
+- ``raise_on_error`` parameter to :func:`Series.where`, :func:`Series.mask`, :func:`DataFrame.where`, :func:`DataFrame.mask` is deprecated, in favor of ``errors=`` (:issue:`14968`)
+- Using :meth:`DataFrame.rename_axis` and :meth:`Series.rename_axis` to alter index or column *labels* is now deprecated in favor of using ``.rename``. ``rename_axis`` may still be used to alter the name of the index or columns (:issue:`17833`).
+- :meth:`~DataFrame.reindex_axis` has been deprecated in favor of :meth:`~DataFrame.reindex`. See :ref:`here ` for more (:issue:`17833`).
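To make the ``.get_value``/``.set_value`` deprecation concrete, here is a minimal editorial sketch (not part of the patch) of the preferred accessors:

.. code-block:: python

   import pandas as pd

   s = pd.Series([10, 20, 30])
   s.at[1]        # label-based scalar lookup; replaces the deprecated s.get_value(1)
   s.iat[2]       # position-based scalar lookup
   s.at[1] = 99   # scalar assignment; replaces the deprecated s.set_value(1, 99)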
+.. _whatsnew_0210.deprecations.select:
+
+Series.select and DataFrame.select
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :meth:`Series.select` and :meth:`DataFrame.select` methods are deprecated in favor of using ``df.loc[labels.map(crit)]`` (:issue:`12401`)
+
+.. ipython:: python
+
+   df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz'])
+
+.. code-block:: ipython
+
+   In [3]: df.select(lambda x: x in ['bar', 'baz'])
+   FutureWarning: select is deprecated and will be removed in a future release. You can use .loc[crit] as a replacement
+   Out[3]:
+        A
+   bar  2
+   baz  3
+
+.. ipython:: python
+
+   df.loc[df.index.map(lambda x: x in ['bar', 'baz'])]
+
+
+.. _whatsnew_0210.deprecations.argmin_min:
+
+Series.argmax and Series.argmin
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The behavior of :func:`Series.argmax` and :func:`Series.argmin` has been deprecated in favor of :func:`Series.idxmax` and :func:`Series.idxmin`, respectively (:issue:`16830`).
+
+For compatibility with NumPy arrays, ``pd.Series`` implements ``argmax`` and
+``argmin``. Since pandas 0.13.0, ``argmax`` has been an alias for
+:meth:`pandas.Series.idxmax`, and ``argmin`` has been an alias for
+:meth:`pandas.Series.idxmin`. They return the *label* of the maximum or minimum,
+rather than the *position*.
+
+We've deprecated the current behavior of ``Series.argmax`` and
+``Series.argmin``. Using either of these will emit a ``FutureWarning``. Use
+:meth:`Series.idxmax` if you want the label of the maximum. Use
+``Series.values.argmax()`` if you want the position of the maximum. Likewise for
+the minimum. In a future release ``Series.argmax`` and ``Series.argmin`` will
+return the position of the maximum or minimum.
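A minimal sketch of the label/position distinction (an editorial example, not from the patch):

.. code-block:: python

   import pandas as pd

   s = pd.Series([10, 30, 20], index=['a', 'b', 'c'])
   s.idxmax()         # 'b' -- the *label* of the maximum
   s.values.argmax()  # 1   -- the *position* of the maximum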
.. _whatsnew_0210.prior_deprecations:

@@ -421,23 +1027,38 @@ Performance Improvements
- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`)
+- Improved performance of :meth:`~Series.cat.set_categories` by not materializing the values (:issue:`17508`)
+- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`)
+- Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`)
+- Improved performance of :meth:`RangeIndex.min` and :meth:`RangeIndex.max` by using ``RangeIndex`` properties to perform the computations (:issue:`17607`)
+
+.. _whatsnew_0210.docs:
+
+Documentation Changes
+~~~~~~~~~~~~~~~~~~~~~
+
+- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`)
+- The documentation has had references to versions < v0.17 removed and cleaned up (:issue:`17442`, :issue:`17404` & :issue:`17504`)

.. _whatsnew_0210.bug_fixes:

Bug Fixes
~~~~~~~~~
-
Conversion
^^^^^^^^^^

- Bug in assignment against datetime-like data with ``int`` which may incorrectly convert to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype which may keep ``int64`` dtype (:issue:`14001`)
-- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`)
- Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`)
- Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`)
- Bug in :func:`Series.fillna` returning a frame when ``inplace=True`` and ``value`` is a dict (:issue:`16156`)
+- Bug in :attr:`Timestamp.weekday_name` returning a UTC-based weekday name when localized to a timezone (:issue:`17354`)
+- Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`)
+- Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`)
+- Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (``DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`); a short sketch follows this list
+- Bug in :meth:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`)
+- Bug in ``DataFrame`` and ``Series`` constructors where ``range`` objects are converted to ``int32`` dtype on Windows instead of ``int64`` (:issue:`16804`)
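For the :meth:`~DataFrame.astype` change above, a short sketch of the new behavior (an editorial example, not from the patch; the import path is the public one):

.. code-block:: python

   import pandas as pd
   from pandas.api.types import CategoricalDtype

   s = pd.Series(['a', 'b', 'a'])
   s.astype(CategoricalDtype())  # an *instance* works as before
   # s.astype(CategoricalDtype)  # passing the *class* now raises TypeError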
Indexing
^^^^^^^^
@@ -455,22 +1076,40 @@ Indexing
- Bug in ``.iloc`` when used with inplace addition or assignment and an int indexer on a ``MultiIndex`` causing the wrong indexes to be read from and written to (:issue:`17148`)
- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`); a short sketch follows this list
- Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`)
+- Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`)
+- Bug in ``IntervalIndex`` where performing a scalar lookup failed for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`)
+- Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when there was no valid entry (:issue:`17400`)
+- Bug in :func:`Series.rename` when called with a callable, which incorrectly altered the name of the ``Series`` rather than the name of the ``Index`` (:issue:`17407`)
+- Bug in :func:`String.str_get` where a negative index raised ``IndexError`` instead of inserting NaN (:issue:`17704`)
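As an editorial illustration of the ``.isin()`` fix mentioned above (not part of the patch):

.. code-block:: python

   import pandas as pd

   s = pd.Series([])   # an empty Series
   s.isin(['a', 'b'])  # previously raised an error; now returns an empty boolean Series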
I/O
^^^

+- Bug in :func:`read_hdf` when reading a timezone aware index from ``fixed`` format HDFStore (:issue:`17618`)
- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`)
- Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
- Bug in :func:`read_csv` when called with a single-element list ``header``, which returned a ``DataFrame`` of all NaN values (:issue:`7757`)
+- Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`)
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
+- Bug in :func:`read_stata` where the index was not set (:issue:`16342`)
- Bug in :func:`read_html` where an import check failed when run in multiple threads (:issue:`16928`)
+- Bug in :func:`read_csv` where automatic delimiter detection caused a ``TypeError`` to be thrown when a bad line was encountered rather than the correct error message (:issue:`13374`)
+- Bug in :meth:`DataFrame.to_html` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`)
+- Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`)
+- Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`)
+- Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`)

Plotting
^^^^^^^^
- Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`)
+- Bug when plotting ``timedelta`` and ``datetime`` dtypes on the y-axis (:issue:`16953`)
+- Line plots no longer assume monotonic x data when calculating xlims; they now show the entire line even for unsorted x data (:issue:`11310`, :issue:`11471`); see the sketch after this list
+- With matplotlib 2.0.0 and above, calculation of x limits for line plots is left to matplotlib, so that its new default settings are applied (:issue:`15495`)
+- Bug in ``Series.plot.bar`` or ``DataFrame.plot.bar`` with ``y`` not respecting user-passed ``color`` (:issue:`16822`)
+- Bug causing ``plotting.parallel_coordinates`` to reset the random seed when using random colors (:issue:`17525`)
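A small editorial sketch of the new line-plot behavior with unsorted x data (assumes a working matplotlib backend; not part of the patch):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'x': [3, 1, 2], 'y': [30, 10, 20]})
   ax = df.plot(x='x', y='y')  # the whole line is drawn even though 'x' is unsorted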
Groupby/Resample/Rolling
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -481,15 +1120,21 @@ Groupby/Resample/Rolling
- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`)
- Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`)
- Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`)
-- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`)
+- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`)
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
+- Bug in ``groupby.nunique()`` with ``TimeGrouper``, which could not handle ``NaT`` correctly (:issue:`17575`)
+- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorted (:issue:`17537`)
+- Bug in ``DataFrame.groupby`` where a spurious warning was raised when a ``Grouper`` object was used to override an ambiguous column name (:issue:`17383`)
+- Bug where ``TimeGrouper`` behaved differently when passed as a list than as a scalar (:issue:`17530`)

Sparse
^^^^^^
- Bug in ``SparseSeries`` raising ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)
-
+- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)
+- Bug in :func:`make_sparse` treating two numeric/boolean values with the same bit pattern as identical when the array ``dtype`` is ``object`` (:issue:`17574`)
+- :func:`SparseArray.all` and :func:`SparseArray.any` are now implemented to handle ``SparseArray``; these were used but not implemented (:issue:`17570`)

Reshaping
^^^^^^^^^
@@ -503,6 +1148,11 @@
- :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`); a short example follows.
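  A short editorial sketch of that fix (not part of the patch):

  .. code-block:: python

     import numpy as np
     import pandas as pd

     s = pd.Series([1.0, np.inf, -np.inf])
     s.idxmax()  # 1 -- the label of np.inf
     s.idxmin()  # 2 -- the label of -np.inf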
- Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) - Bug in :func:`concat` where order of result index was unpredictable if it contained non-comparable elements (:issue:`17344`) +- Fixes regression when sorting by multiple columns on a ``datetime64`` dtype ``Series`` with ``NaT`` values (:issue:`16836`) +- Bug in :func:`pivot_table` where the result's columns did not preserve the categorical dtype of ``columns`` when ``dropna`` was ``False`` (:issue:`17842`) +- Bug in ``DataFrame.drop_duplicates`` where dropping with non-unique column names raised a ``ValueError`` (:issue:`17836`) +- Bug in :func:`unstack` which, when called on a list of levels, would discard the ``fillna`` argument (:issue:`13971`) +- Bug in the alignment of ``range`` objects and other list-likes with ``DataFrame`` leading to operations being performed row-wise instead of column-wise (:issue:`17901`) Numeric ^^^^^^^ @@ -512,13 +1162,26 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in :func:`Series.isin` when called with a categorical (:issue`16639`) -- Bug in the categorical constructor with empty values and categories causing - the ``.categories`` to be an empty ``Float64Index`` rather than an empty - ``Index`` with object dtype (:issue:`17248`) +- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) +- Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) +- Bug in categorical operations with :ref:`Series.cat ` not preserving the original Series' name (:issue:`17509`) +- Bug in :func:`DataFrame.merge` failing for categorical columns with boolean/int data types (:issue:`17187`) +- Bug in constructing a ``Categorical``/``CategoricalDtype`` when the specified ``categories`` are of categorical type (:issue:`17884`). + +.. _whatsnew_0210.pypy: + +PyPy +^^^^ +- Compatibility with PyPy in :func:`read_csv` with ``usecols=[]`` and + :func:`read_json` (:issue:`17351`) +- Split tests into cases for CPython and PyPy where needed, which highlights the fragility + of index matching with ``float('nan')``, ``np.nan`` and ``NAT`` (:issue:`17351`) +- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, + so an approximation is used instead (:issue:`17228`) Other ^^^^^ +- Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) -- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`) + From ea4269724e372f8b2167d59b6872d620ef149b1c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 31 Oct 2017 21:38:31 -0400 Subject: [PATCH 10/21] rebase and cleanup --- doc/source/whatsnew/v0.21.0.txt | 32 ---------- pandas/core/groupby.py | 3 - pandas/tests/groupby/test_groupby.py | 81 -------------------------- pandas/tests/groupby/test_whitelist.py | 19 ------ 4 files changed, 135 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4c8e094e69068..4c460eeb85b82 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -309,28 +309,12 @@ New keywords - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. 
(:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) -<<<<<<< HEAD -- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) -- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) -- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) -- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. -- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) -- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). -- :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). -- :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - -======= - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`) Various enhancements """""""""""""""""""" ->>>>>>> 95d4ba8... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts - Improved the import time of pandas by about 2.25x. (:issue:`16764`) - Support for `PEP 519 -- Adding a file system path protocol `_ on most readers (e.g. @@ -354,22 +338,6 @@ Various enhancements - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`) - :func:`Series.reindex`, :func:`DataFrame.reindex`, :func:`Index.get_indexer` now support list-like argument for ``tolerance``. (:issue:`17367`) -<<<<<<< HEAD -======= -- :func: groupby.is_monotonic_increasing and .is_monotonic_decreasing extend Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) -======= -- is_monotonic_increasing/decreasing is added to .groupby(). (:issue:`17015`) ->>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing -======= ->>>>>>> 8ed37cd... removed edits to whatsnew 0.21.0 - ->>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 -======= - - ->>>>>>> ceceae1... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts -======= ->>>>>>> 95d4ba8... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts .. 
_whatsnew_0210.api_breaking: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9d63cf84ebe3b..ec3cce2821036 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1695,7 +1695,6 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] -<<<<<<< HEAD def pipe(self, func, *args, **kwargs): """ Apply a function with arguments to this GroupBy object, @@ -1744,8 +1743,6 @@ def pipe(self, func, *args, **kwargs): """ return _pipe(self, func, *args, **kwargs) -======= ->>>>>>> e99897c... ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 @Substitution(name='groupby') @Appender(_doc_template) def is_monotonic_increasing(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 87e06fbb01e81..c69b4730e943e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3701,8 +3701,6 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) -<<<<<<< HEAD -<<<<<<< HEAD @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly increasing (T), strictly decreasing (F), # abs val increasing (F), non-strictly increasing (T) @@ -3761,85 +3759,6 @@ def test_is_monotonic_decreasing(self, in_vals, out_vals): expected.index.name = 'B' tm.assert_series_equal(result, expected) -======= - def test_is_increasing_is_decreasing(self): - # GH 17015 - -======= - @pytest.mark.parametrize('in_vals, out_vals', [ ->>>>>>> f8554ee... parametrized tests for gb.is_monotonic_increasing/decreasing - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], - [True, False, False, True]), - # Test with inf vals - ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False]), - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), - ]) - def test_is_monotonic_increasing(self, in_vals, out_vals): - # GH 17015 - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} - df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=out_vals, - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) - - # Also check result equal to manually taking x.is_monotonic_increasing. 
- expected = ( - df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('in_vals, out_vals', [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], - [True, False, False, True]), - # Test with inf vals - ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True]), - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), - ]) - def test_is_monotonic_decreasing(self, in_vals, out_vals): - # GH 17015 - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} - - df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_decreasing() - expected = pd.Series(index=['a', 'b', 'c', 'd'], - data=out_vals, - name='C') - expected.index.name = 'B' - tm.assert_series_equal(result, expected) -<<<<<<< HEAD ->>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing -======= - ->>>>>>> f8554ee... parametrized tests for gb.is_monotonic_increasing/decreasing - # Also check result equal to manually taking x.is_monotonic_decreasing. - expected = df.groupby('B').C.apply(lambda x: x.is_monotonic_decreasing) - tm.assert_series_equal(result, expected) - -<<<<<<< HEAD -<<<<<<< HEAD -======= - ->>>>>>> 740c7c2... added tests for gb.is_monotonically_increasing()/decreasing -======= ->>>>>>> f8554ee... parametrized tests for gb.is_monotonic_increasing/decreasing def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 865e51ca985de..37c3f725ff9a7 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -239,7 +239,6 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): grp = mframe.groupby(level='second') results = set([v for v in dir(grp) if not v.startswith('_')]) -<<<<<<< HEAD expected = set( ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', @@ -250,26 +249,8 @@ def test_tab_completion(mframe): 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', -<<<<<<< HEAD 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', -======= - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', ->>>>>>> e99897c... 
ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 'is_monotonic_increasing', 'is_monotonic_decreasing']) -======= - expected = { - 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', - 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', - 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'describe', 'cummax', 'quantile', - 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', - 'is_monotonic_increasing', 'is_monotonic_decreasing'} ->>>>>>> ceceae1... ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts assert results == expected From ffb6200bf2391de2d3c28036de91a81d3532bd6d Mon Sep 17 00:00:00 2001 From: ghasemnaddaf Date: Mon, 13 Nov 2017 13:03:56 -0800 Subject: [PATCH 11/21] DOC: add docstring for MultiIndex.fillna (#18018) (#18269) ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts parametrized tests for gb.is_monotonic_increasing/decreasing ENH: gb.is_monotonic_increasing, is_monotonic_decreasing #17015 added tests for gb.is_monotonically_increasing()/decreasing parametrized tests for gb.is_monotonic_increasing/decreasing ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts ENH: gb.is_monotonic_increasing #17015 fix rebase conflicts rebase and cleanup simplified test format fixed whatsnew to include method tags --- doc/source/api.rst | 2 + doc/source/whatsnew/v0.21.0.txt | 1 - doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/groupby.py | 49 ++++++++++++++++++++++++ pandas/core/indexes/multi.py | 5 ++- pandas/tests/groupby/test_groupby.py | 52 ++++++++++++++++++++++++++ pandas/tests/groupby/test_whitelist.py | 23 ++++++------ 7 files changed, 119 insertions(+), 15 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index ce88aed91823c..a18f6f2adf9e8 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2096,6 +2096,8 @@ The following methods are available only for ``SeriesGroupBy`` objects. SeriesGroupBy.nunique SeriesGroupBy.unique SeriesGroupBy.value_counts + SeriesGroupBy.is_monotonic_increasing + SeriesGroupBy.is_monotonic_decreasing The following methods are available only for ``DataFrameGroupBy`` objects. diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 89e2d3006696c..4911ecbb161a5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1172,4 +1172,3 @@ Other ^^^^^ - Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) - diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 793e9bf17bac9..c862a13b16ba5 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- +- :meth: groupby.is_monotonic_increasing and :meth: .is_monotonic_decreasing extend :meth: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) - .. 
_whatsnew_0211.deprecations: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7a58b7d358fbb..f6a2272ed8f8d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1742,6 +1742,55 @@ def pipe(self, func, *args, **kwargs): """ return _pipe(self, func, *args, **kwargs) + @Substitution(name='groupby') + @Appender(_doc_template) + def is_monotonic_increasing(self): + """ + Returns whether each group is monotonically increasing. + + Equivalent to ``.apply(lambda x: x.is_monotonic_increasing)``. + + Examples + -------- + >>> source_dict = { + ... 'A': ['this', 'col', 'is', 'entirely', 'irrelevant', '.'], + ... 'B': ['cat_a', 'cat_a', 'cat_a', 'cat_b', 'cat_b', 'cat_b'], + ... 'C': [1, 2, 3, 2, 2, 0]} + + >>> df = pd.DataFrame(source_dict) + ... df.groupby(['B']).C.is_monotonic_increasing() + B + cat_a True + cat_b False + Name: C, dtype: bool + + """ + return self.apply(lambda x: x.is_monotonic_increasing) + + @Substitution(name='groupby') + @Appender(_doc_template) + def is_monotonic_decreasing(self): + """ + Returns whether each group is monotonically decreasing. + + Equivalent to ``.apply(lambda x: x.is_monotonic_decreasing)``. + + Examples + -------- + >>> source_dict = { + ... 'A': ['this', 'col', 'is', 'entirely', 'irrelevant', '.'], + ... 'B': ['cat_a', 'cat_a', 'cat_a', 'cat_b', 'cat_b', 'cat_b'], + ... 'C': [1, 2, 3, 2, 2, 0]} + + >>> df = pd.DataFrame(source_dict) + ... df.groupby(['B']).C.is_monotonic_decreasing() + B + cat_a False + cat_b True + Name: C, dtype: bool + """ + return self.apply(lambda x: x.is_monotonic_decreasing) + GroupBy._add_numeric_operations() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f603a0eef36a5..f4acb6862addb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -820,9 +820,10 @@ def duplicated(self, keep='first'): return duplicated_int64(ids, keep) - @Appender(ibase._index_shared_docs['fillna']) def fillna(self, value=None, downcast=None): - # isna is not implemented for MultiIndex + """ + fillna is not implemented for MultiIndex + """ raise NotImplementedError('isna is not defined for MultiIndex') @Appender(_index_shared_docs['dropna']) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 03f780957b15e..24b5685a0e2e2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2591,6 +2591,58 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('in_vals, out_vals', [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], + [True, False, False, True]), + # Test with inf vals + ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False]), + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_increasing(self, in_vals, out_vals): + # GH 17015 + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + df = pd.DataFrame(source_dict) + result = df.groupby(['B']).C.is_monotonic_increasing() + expected = pd.Series(index=list('abcd'), name='B') + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. 
+ expected = ( + df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('in_vals, out_vals', [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], + [True, False, False, True]), + # Test with inf vals + ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True]), + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_decreasing(self, in_vals, out_vals): + # GH 17015 + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_decreasing() + expected = pd.Series(index=list('abcd'), name='B') + tm.assert_series_equal(result, expected) + def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index e8e2150558edb..37c3f725ff9a7 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -239,17 +239,18 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): grp = mframe.groupby(level='second') results = set([v for v in dir(grp) if not v.startswith('_')]) - expected = { - 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', - 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', - 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'describe', 'cummax', 'quantile', - 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe'} + expected = set( + ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', + 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', + 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', + 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', + 'nunique', 'head', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', + 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', + 'is_monotonic_increasing', 'is_monotonic_decreasing']) assert results == expected From ee9fa7efbd5d3e05ae6ff2e9adc709773bca0f95 Mon Sep 17 00:00:00 2001 From: No-Stream Date: Tue, 14 Nov 2017 14:17:40 -0800 Subject: [PATCH 12/21] ENH: gb.is_monotonic_increasing #17015 tests revised and passing --- pandas/tests/groupby/test_groupby.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 735afd5f98d19..99dd0d7c12669 100644 --- a/pandas/tests/groupby/test_groupby.py +++ 
b/pandas/tests/groupby/test_groupby.py @@ -2610,9 +2610,9 @@ def test_is_monotonic_increasing(self, in_vals, out_vals): 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], 'C': in_vals} df = pd.DataFrame(source_dict) - result = df.groupby(['B']).C.is_monotonic_increasing() - expected = pd.Series(index=list('abcd'), name='B') - + result = df.groupby('B').C.is_monotonic_increasing() + expected = pd.Series(index=list('abcd'), data=out_vals, name='C') + expected.index.name = 'B' tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. @@ -2641,7 +2641,8 @@ def test_is_monotonic_decreasing(self, in_vals, out_vals): df = pd.DataFrame(source_dict) result = df.groupby('B').C.is_monotonic_decreasing() - expected = pd.Series(index=list('abcd'), name='B') + expected = pd.Series(index=list('abcd'), data=out_vals, name='C') + expected.index.name = 'B' tm.assert_series_equal(result, expected) def test_apply_numeric_coercion_when_datetime(self): From f24e476240753a5cb1fb94b8dedcf0b8b0cbea0b Mon Sep 17 00:00:00 2001 From: No-Stream Date: Mon, 11 Dec 2017 15:18:49 -0800 Subject: [PATCH 13/21] ENH: gb.is_monotonic_increasing #17015 minor fixes for @jreback --- doc/source/whatsnew/v0.21.1.txt | 1 - doc/source/whatsnew/v0.22.0.txt | 2 ++ pandas/core/groupby.py | 8 ++++++++ pandas/tests/groupby/test_groupby.py | 15 ++++++++++----- pandas/tests/groupby/test_whitelist.py | 5 +++-- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index d9c6efb408a07..e307e605687bf 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -23,7 +23,6 @@ Other Enhancements - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) - :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`). -- :meth: groupby.is_monotonic_increasing and :meth: .is_monotonic_decreasing extend :meth: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) - .. _whatsnew_0211.deprecations: diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1eb1b548788b9..6880bbed0999a 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -45,6 +45,8 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) +- :meth: groupby.is_monotonic_increasing and :meth: .is_monotonic_decreasing extend :meth: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) +- .. _whatsnew_0220.api_breaking: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 060e81410b2e7..b9ba72cc3fcf1 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1834,6 +1834,8 @@ def is_monotonic_increasing(self): Equivalent to ``.apply(lambda x: x.is_monotonic_increasing)``. + .. 
versionadded:: 0.22.0 + Examples -------- >>> source_dict = { @@ -1848,6 +1850,10 @@ def is_monotonic_increasing(self): cat_b False Name: C, dtype: bool + See Also + -------- + pandas.Series.is_monotonic_increasing + pandas.Index.is_monotonic_increasing """ return self.apply(lambda x: x.is_monotonic_increasing) @@ -1859,6 +1865,8 @@ def is_monotonic_decreasing(self): Equivalent to ``.apply(lambda x: x.is_monotonic_decreasing)``. + .. versionadded:: 0.22.0 + Examples -------- >>> source_dict = { diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1fe797b6e18c9..22284bac5ad48 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2475,7 +2475,7 @@ def test_group_shift_with_null_key(self): # Generate a moderately large dataframe with occasional missing # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partilly missing. + # at those places, where the group-by key is partially missing. df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) @@ -2601,13 +2601,16 @@ def test_cummin_cummax(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize('in_vals, out_vals', [ + # Basics: strictly increasing (T), strictly decreasing (F), # abs val increasing (F), non-strictly increasing (T) ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], [True, False, True, False]), + # Test with nan vals; should always be False ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], [False, False, False, False]), @@ -2620,8 +2623,8 @@ def test_is_monotonic_increasing(self, in_vals, out_vals): 'C': in_vals} df = pd.DataFrame(source_dict) result = df.groupby('B').C.is_monotonic_increasing() - expected = pd.Series(index=list('abcd'), data=out_vals, name='C') - expected.index.name = 'B' + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. 
@@ -2634,9 +2637,11 @@ def test_is_monotonic_increasing(self, in_vals, out_vals): # abs val decreasing (F), non-strictly increasing (T) ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], [True, True, False, True]), + # Test with nan vals; should always be False ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], [False, False, False, False]), @@ -2650,8 +2655,8 @@ def test_is_monotonic_decreasing(self, in_vals, out_vals): df = pd.DataFrame(source_dict) result = df.groupby('B').C.is_monotonic_decreasing() - expected = pd.Series(index=list('abcd'), data=out_vals, name='C') - expected.index.name = 'B' + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') tm.assert_series_equal(result, expected) def test_apply_numeric_coercion_when_datetime(self): diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index e30846d100246..34ceb602ccd37 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -184,7 +184,7 @@ def test_regression_whitelist_methods( axis, skipna, sort): # GH6944 # GH 17537 - # explicity test the whitelest methods + # explicitly test the whitelist methods if axis == 0: frame = raw_frame @@ -250,7 +250,8 @@ def test_tab_completion(mframe): 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', - 'is_monotonic_increasing', 'is_monotonic_decreasing'} + 'is_monotonic_increasing', 'is_monotonic_decreasing' + } assert results == expected From db3e6c084323570cb37687cd6ff72e70dc72a7fa Mon Sep 17 00:00:00 2001 From: No-Stream Date: Mon, 11 Dec 2017 16:21:38 -0800 Subject: [PATCH 14/21] ENH: gb.is_monotonic_increasing #17015 fix changed whatsnew --- doc/source/whatsnew/v0.21.0.txt | 1 + doc/source/whatsnew/v0.22.0.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4911ecbb161a5..89e2d3006696c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1172,3 +1172,4 @@ Other ^^^^^ - Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) + diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 6d7058bc335a5..59e84c77346c5 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -139,7 +139,7 @@ Other Enhancements - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) - :meth: groupby.is_monotonic_increasing and :meth: .is_monotonic_decreasing extend :meth: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) -- +- .. 
_whatsnew_0220.api_breaking: From fb0bc3cec777a8c6df1e6c685d107c77020699bc Mon Sep 17 00:00:00 2001 From: No-Stream Date: Mon, 11 Dec 2017 17:29:25 -0800 Subject: [PATCH 15/21] ENH: gb.is_monotonic_increasing #17015 fix 22.0 whatsnew --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 59e84c77346c5..3958f4719e3bd 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -138,7 +138,7 @@ Other Enhancements - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) -- :meth: groupby.is_monotonic_increasing and :meth: .is_monotonic_decreasing extend :meth: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) +- :func: groupby.is_monotonic_increasing and :func: .is_monotonic_decreasing extend :func: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) - .. _whatsnew_0220.api_breaking: From e5a90fa0e016160ff55cd722f77f20b92a27a5f4 Mon Sep 17 00:00:00 2001 From: No-Stream Date: Fri, 15 Dec 2017 15:06:07 -0800 Subject: [PATCH 16/21] ENH: gb.is_monotonic_increasing #17015 alternate version to remove function defs --- pandas/core/groupby.py | 61 ++------------------------ pandas/tests/groupby/test_groupby.py | 6 +-- pandas/tests/groupby/test_whitelist.py | 3 +- 3 files changed, 8 insertions(+), 62 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8aa085d43903e..8843bc5b81152 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -281,7 +281,9 @@ ]) | _plotting_methods _series_apply_whitelist = ((_common_apply_whitelist | - {'nlargest', 'nsmallest'}) - + {'nlargest', 'nsmallest', + 'is_monotonic_increasing', + 'is_monotonic_decreasing'}) - {'boxplot'}) | frozenset(['dtype', 'unique']) _dataframe_apply_whitelist = ((_common_apply_whitelist | @@ -1826,63 +1828,6 @@ def pipe(self, func, *args, **kwargs): """ return _pipe(self, func, *args, **kwargs) - @Substitution(name='groupby') - @Appender(_doc_template) - def is_monotonic_increasing(self): - """ - Returns whether each group is monotonically increasing. - - Equivalent to ``.apply(lambda x: x.is_monotonic_increasing)``. - - .. versionadded:: 0.22.0 - - Examples - -------- - >>> source_dict = { - ... 'A': ['this', 'col', 'is', 'entirely', 'irrelevant', '.'], - ... 'B': ['cat_a', 'cat_a', 'cat_a', 'cat_b', 'cat_b', 'cat_b'], - ... 'C': [1, 2, 3, 2, 2, 0]} - - >>> df = pd.DataFrame(source_dict) - ... df.groupby(['B']).C.is_monotonic_increasing() - B - cat_a True - cat_b False - Name: C, dtype: bool - - See Also - -------- - pandas.Series.is_monotonic_increasing - pandas.Index.is_monotonic_increasing - """ - return self.apply(lambda x: x.is_monotonic_increasing) - - @Substitution(name='groupby') - @Appender(_doc_template) - def is_monotonic_decreasing(self): - """ - Returns whether each group is monotonically decreasing. - - Equivalent to ``.apply(lambda x: x.is_monotonic_decreasing)``. - - .. versionadded:: 0.22.0 - - Examples - -------- - >>> source_dict = { - ... 
'A': ['this', 'col', 'is', 'entirely', 'irrelevant', '.'], - ... 'B': ['cat_a', 'cat_a', 'cat_a', 'cat_b', 'cat_b', 'cat_b'], - ... 'C': [1, 2, 3, 2, 2, 0]} - - >>> df = pd.DataFrame(source_dict) - ... df.groupby(['B']).C.is_monotonic_decreasing() - B - cat_a False - cat_b True - Name: C, dtype: bool - """ - return self.apply(lambda x: x.is_monotonic_decreasing) - GroupBy._add_numeric_operations() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 22284bac5ad48..83053d68eb21e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2622,13 +2622,13 @@ def test_is_monotonic_increasing(self, in_vals, out_vals): 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], 'C': in_vals} df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_increasing() + result = df.groupby('B').C.is_monotonic_increasing index = Index(list('abcd'), name='B') expected = pd.Series(index=index, data=out_vals, name='C') tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. - expected = ( + expecteAd = ( df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) tm.assert_series_equal(result, expected) @@ -2654,7 +2654,7 @@ def test_is_monotonic_decreasing(self, in_vals, out_vals): 'C': in_vals} df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_decreasing() + result = df.groupby('B').C.is_monotonic_decreasing index = Index(list('abcd'), name='B') expected = pd.Series(index=index, data=out_vals, name='C') tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 34ceb602ccd37..8d6e074881cbb 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -88,6 +88,8 @@ 'unique', 'nlargest', 'nsmallest', + 'is_monotonic_increasing', + 'is_monotonic_decreasing', ]) @@ -250,7 +252,6 @@ def test_tab_completion(mframe): 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', - 'is_monotonic_increasing', 'is_monotonic_decreasing' } assert results == expected From d747d35727cb05ba0bf185d7e08ad4aa953c9d0e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 5 Jan 2018 18:43:21 -0500 Subject: [PATCH 17/21] fixup whatsnew --- doc/source/whatsnew/v0.22.0.txt | 26 -------------------------- doc/source/whatsnew/v0.23.0.txt | 3 ++- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 76876b72da7fe..d165339cb0de9 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -179,34 +179,8 @@ entirely valid. .. ipython:: python -<<<<<<< HEAD - s.rank(na_option='top') - -.. _whatsnew_0220.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ - -- Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`) -- :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. 
(:issue:`18171`) -- :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`) -- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`) -- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) -- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) -- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) -- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) -- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`) -- :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`). -- :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`) -- :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) -- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) -- :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) -- ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`) -- :func: groupby.is_monotonic_increasing and :func: .is_monotonic_decreasing extend :func: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) -======= idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) pd.Series([1, 2], index=idx).resample("12H").sum() ->>>>>>> master Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index d53de30187156..7bbdbfa1f37c7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -171,6 +171,7 @@ Other Enhancements - ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method. Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`). - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`). +- :func: groupby.is_monotonic_increasing and :func: .is_monotonic_decreasing extend :func: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) .. 
_whatsnew_0230.api_breaking: @@ -338,7 +339,7 @@ Conversion - Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`) - Bug in :class:`FY5253Quarter`, :class:`LastWeekOfMonth` where rollback and rollforward behavior was inconsistent with addition and subtraction behavior (:issue:`18854`) - Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) - Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (issue:`19042`) - Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (issue:`19043`) From dd9a7fcf74b60e9e21dc88c632c50475bb5fcf64 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 11 Feb 2018 12:14:08 -0500 Subject: [PATCH 18/21] Resolve v0.23.0.txt merge conflict --- doc/source/whatsnew/v0.23.0.txt | 525 ++++++++++++++++++++++++++++---- 1 file changed, 470 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7bbdbfa1f37c7..6436a8464b5d0 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -13,10 +13,38 @@ version. New features ~~~~~~~~~~~~ -- -- -- +.. _whatsnew_0210.enhancements.limit_area: + +``DataFrame.interpolate`` has gained the ``limit_area`` kwarg +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.interpolate` has gained a ``limit_area`` parameter to allow further control of which ``NaN`` s are replaced. +Use `limit_area='inside'` to fill only NaNs surrounded by valid values or use `limit_area='outside'` to fill only ``NaN`` s +outside the existing valid values while preserving those inside. (:issue:`16284`) See the :ref:`full documentation here `. + + +.. ipython:: python + + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) + ser + +Fill one consecutive inside value in both directions + +.. ipython:: python + + ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + +Fill all consecutive outside values backward +.. ipython:: python + + ser.interpolate(limit_direction='backward', limit_area='outside') + +Fill all consecutive outside values in both directions + +.. ipython:: python + + ser.interpolate(limit_direction='both', limit_area='outside') .. _whatsnew_0210.enhancements.get_dummies_dtype: @@ -114,7 +142,7 @@ Previous Behavior: 4 NaN dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -139,17 +167,133 @@ Previous Behavior: 3 2.5 dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python s.rank(na_option='top') +.. 
_whatsnew_0230.enhancements.round-trippable_json: + +JSON read/write round-trippable with ``orient='table'`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata. + +.. ipython:: python + + df = pd.DataFrame({'foo': [1, 2, 3, 4], + 'bar': ['a', 'b', 'c', 'd'], + 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + 'qux': pd.Categorical(['a', 'b', 'c', 'c']) + }, index=pd.Index(range(4), name='idx')) + df + df.dtypes + df.to_json('test.json', orient='table') + new_df = pd.read_json('test.json', orient='table') + new_df + new_df.dtypes + +Please note that the string `index` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. + +.. ipython:: python + + df.index.name = 'index' + df.to_json('test.json', orient='table') + new_df = pd.read_json('test.json', orient='table') + new_df + print(new_df.index.name) + +.. _whatsnew_0230.enhancements.index_division_by_zero: + +Index Division By Zero Fills Correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) + +Previous Behavior: + +.. code-block:: ipython + + In [6]: index = pd.Int64Index([-1, 0, 1]) + + In [7]: index / 0 + Out[7]: Int64Index([0, 0, 0], dtype='int64') + + # Previous behavior yielded different results depending on the type of zero in the divisor + In [8]: index / 0.0 + Out[8]: Float64Index([-inf, nan, inf], dtype='float64') + + In [9]: index = pd.UInt64Index([0, 1]) + + In [10]: index / np.array([0, 0], dtype=np.uint64) + Out[10]: UInt64Index([0, 0], dtype='uint64') + + In [11]: pd.RangeIndex(1, 5) / 0 + ZeroDivisionError: integer division or modulo by zero + +Current Behavior: + +.. ipython:: python + + index = pd.Int64Index([-1, 0, 1]) + # division by zero gives -infinity where negative, +infinity where positive, and NaN for 0 / 0 + index / 0 + + # The result of division by zero should not depend on whether the zero is int or float + index / 0.0 + + index = pd.UInt64Index([0, 1]) + index / np.array([0, 0], dtype=np.uint64) + + pd.RangeIndex(1, 5) / 0 + +.. _whatsnew_0230.enhancements.assign_dependent: + +``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 +`_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the +:ref:`documentation here ` (:issue:`14207`) + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3]}) + df + df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + +.. warning:: + + This may subtly change the behavior of your code when you're + using ``.assign()`` to update an existing column. Previously, callables + referring to other variables being updated would get the "old" values + + Previous Behaviour: + + .. 
code-block:: ipython + + In [2]: df = pd.DataFrame({"A": [1, 2, 3]}) + + In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1) + Out[3]: + A C + 0 2 -1 + 1 3 -2 + 2 4 -3 + + New Behaviour: + + .. ipython:: python + + df.assign(A=df.A+1, C= lambda df: df.A* -1) + .. _whatsnew_0230.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ +- Unary ``+`` now permitted for ``Series`` and ``DataFrame`` as numeric operator (:issue:`16073`) - Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`) - :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. (:issue:`18171`) - :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`) @@ -171,6 +315,14 @@ Other Enhancements - ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method. Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`). - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`). +- Added :func:`pandas.api.extensions.register_dataframe_accessor`, + :func:`pandas.api.extensions.register_series_accessor`, and + :func:`pandas.api.extensions.register_index_accessor`, accessor for libraries downstream of pandas + to register custom accessors like ``.cat`` on pandas objects. See + :ref:`Registering Custom Accessors ` for more (:issue:`14781`). + +- ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) +- :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) - :func: groupby.is_monotonic_increasing and :func: .is_monotonic_decreasing extend :func: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) .. _whatsnew_0230.api_breaking: @@ -194,6 +346,104 @@ If installed, we now require: | openpyxl | 2.4.0 | | +-----------------+-----------------+----------+ +.. _whatsnew_0230.api_breaking.deprecate_panel: + +Deprecate Panel +^^^^^^^^^^^^^^^ + +``Panel`` was deprecated in the 0.20.x release, showing as a ``DeprecationWarning``. Using ``Panel`` will now show a ``FutureWarning``. The recommended way to represent 3-D data are +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +provides a :meth:`~Panel.to_xarray` method to automate this conversion. For more details see :ref:`Deprecate Panel ` documentation. (:issue:`13563`, :issue:`18324`). + +.. ipython:: python + :okwarning: + + p = tm.makePanel() + p + +Convert to a MultiIndex DataFrame + +.. ipython:: python + + p.to_frame() + +Convert to an xarray DataArray + +.. ipython:: python + :okwarning: + + p.to_xarray() + +.. _whatsnew_0230.api_breaking.apply: + +Changes to make output of ``DataFrame.apply`` consistent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies +are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned, this includes the case +where a list-like (e.g. 
``tuple`` or ``list`` is returned) (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, +:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`). + +.. ipython:: python + + df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) + df + +Previous Behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``. +If the return shape did not match, a ``Series`` with lists was returned. + +.. code-block:: python + + In [3]: df.apply(lambda x: [1, 2, 3], axis=1) + Out[3]: + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + In [4]: df.apply(lambda x: [1, 2], axis=1) + Out[4]: + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + dtype: object + + +New Behavior: When the applied function returns a list-like, this will now *always* return a ``Series``. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1) + df.apply(lambda x: [1, 2], axis=1) + +To have expanded columns, you can use ``result_type='expand'`` + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + +To broadcast the result across the original columns (the old behaviour for +list-likes of the correct length), you can use ``result_type='broadcast'``. +The shape must match the original columns. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + +Returning a ``Series`` allows one to control the exact return structure and column names: + +.. ipython:: python + + df.apply(lambda x: pd.Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) + + +.. _whatsnew_0230.api_breaking.build_changes: Build Changes ^^^^^^^^^^^^^ @@ -202,6 +452,78 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +.. _whatsnew_0230.api_breaking.extract: + +Extraction of matching patterns from strings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, extracting matching patterns from strings with :func:`str.extract` used to return a +``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was +extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +``expand`` is set to ``False`` (:issue:`11386`). + +Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to +``False``), but now raises a ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: s = pd.Series(['number 10', '12 eggs']) + + In [2]: extracted = s.str.extract('.*(\d\d).*') + + In [3]: extracted + Out [3]: + 0 10 + 1 12 + dtype: object + + In [4]: type(extracted) + Out [4]: + pandas.core.series.Series + +New Behavior: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*') + extracted + type(extracted) + +To restore previous behavior, simply set ``expand`` to ``False``: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*', expand=False) + extracted + type(extracted) + ..
_whatsnew_0230.api_breaking.cdt_ordered: + +Default value for the ``ordered`` parameter of ``CategoricalDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None`` to allow updating of ``categories`` without impacting ``ordered``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`) + +In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``. + +New Behavior: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) + cat + cdt = CategoricalDtype(categories=list('cbad')) + cat.astype(cdt) + +Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``. + +Note that the unintentional conversion of ``ordered`` discussed above did not arise in previous versions due to separate bugs that prevented ``astype`` from doing any type of category to category conversion (:issue:`10696`, :issue:`18593`). These bugs have been fixed in this release, and motivated changing the default value of ``ordered``. + ..
_whatsnew_0230.api: Other API Changes @@ -240,6 +562,15 @@ Other API Changes - Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (issue:`18817`) - :class:`IntervalIndex` and ``IntervalDtype`` no longer support categorical, object, and string subtypes (:issue:`19016`) - The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`) +- ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`) +- ``KeyError`` now raises instead of ``ValueError`` in :meth:`~DataFrame.drop`, :meth:`~Panel.drop`, :meth:`~Series.drop`, :meth:`~Index.drop` when dropping a non-existent element in an axis with duplicates (:issue:`19186`) +- :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`) +- Addition or subtraction of ``NaT`` from :class:`TimedeltaIndex` will return ``TimedeltaIndex`` instead of ``DatetimeIndex`` (:issue:`19124`) +- :func:`DatetimeIndex.shift` and :func:`TimedeltaIndex.shift` will now raise ``NullFrequencyError`` (which subclasses ``ValueError``, which was raised in older versions) when the index object frequency is ``None`` (:issue:`19147`) +- Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError`` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) +- Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) +- :class:`DateOffset` objects render more simply, e.g. "" instead of "" (:issue:`19403`) +- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) .. _whatsnew_0230.deprecations: @@ -256,6 +587,10 @@ Deprecations - ``Series.valid`` is deprecated. Use :meth:`Series.dropna` instead (:issue:`18800`). - :func:`read_excel` has deprecated the ``skip_footer`` parameter. Use ``skipfooter`` instead (:issue:`18836`) - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). +- ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) +- :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) +- The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) +- The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) ..
_whatsnew_0230.prior_deprecations: @@ -283,6 +618,12 @@ Removal of prior version deprecations/changes - The ``freqstr`` keyword has been removed from ``pandas.tseries.frequencies.to_offset`` in favor of ``freq`` (:issue:`13874`) - The ``Panel4D`` and ``PanelND`` classes have been removed (:issue:`13776`) - The ``Panel`` class has dropped the ``to_long`` and ``toLong`` methods (:issue:`19077`) +- The options ``display.line_width`` and ``display.height`` are removed in favor of ``display.width`` and ``display.max_rows`` respectively (:issue:`4391`, :issue:`19107`) +- The ``labels`` attribute of the ``Categorical`` class has been removed in favor of :attr:`Categorical.codes` (:issue:`7768`) +- The ``flavor`` parameter has been removed from the :func:`to_sql` method (:issue:`13611`) +- The modules `pandas.tools.hashing` and `pandas.util.hashing` have been removed (:issue:`16223`) +- The top-level functions ``pd.rolling_*``, ``pd.expanding_*`` and ``pd.ewm*`` have been removed (Deprecated since v0.18). + Instead, use the DataFrame/Series methods :attr:`~DataFrame.rolling`, :attr:`~DataFrame.expanding` and :attr:`~DataFrame.ewm` (:issue:`18723`) .. _whatsnew_0230.performance: @@ -301,6 +642,10 @@ Performance Improvements - Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) +- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) +- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) +- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) +- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) .. _whatsnew_0230.docs: @@ -319,70 +664,144 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +Categorical +^^^^^^^^^^^ -Conversion -^^^^^^^^^^ - -- Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) -- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) -- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) -- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) +.. warning:: + + A class of bugs were introduced in pandas 0.21 with ``CategoricalDtype`` that + affects the correctness of operations like ``merge``, ``concat``, and + indexing when comparing multiple unordered ``Categorical`` arrays that have + the same categories, but in a different order. We highly recommend upgrading + or manually aligning your categories before doing these operations.
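To make the failure mode concrete, here is a minimal sketch of the data pattern the warning describes (not part of the patch; the printed result assumes a build carrying the fixes listed below):

.. code-block:: python

    import pandas as pd

    # Same values and the same (unordered) categories, but the categories
    # are listed in a different order, so the underlying integer codes differ.
    c1 = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
    c2 = pd.Categorical(['a', 'b', 'a'], categories=['b', 'a'])

    # The affected operations could compare the codes instead of the category
    # values; with the fixes in place the comparison is by value.
    print(c1.equals(c2))  # expected: True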
+ +- Bug in ``Categorical.equals`` returning the wrong result when comparing two + unordered ``Categorical`` arrays with the same categories, but in a different + order (:issue:`16603`) +- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result + when for unordered categoricals with the categories in a different order. + This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). +- Bug in :func:`pandas.merge` returning the wrong result when joining on an + unordered ``Categorical`` that had the same categories but in a different + order (:issue:`19551`) +- Bug in :meth:`CategoricalIndex.get_indexer` returning the wrong result when + ``target`` was an unordered ``Categorical`` that had the same categories as + ``self`` but in a different order (:issue:`19551`) - Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) -- Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) + +Datetimelike +^^^^^^^^^^^^ + +- Bug in :func:`Series.__sub__` subtracting a non-nanosecond ``np.datetime64`` object from a ``Series`` gave incorrect results (:issue:`7996`) +- Bug in :class:`DatetimeIndex`, :class:`TimedeltaIndex` addition and subtraction of zero-dimensional integer arrays gave incorrect results (:issue:`19012`) +- Bug in :func:`Series.__add__` adding Series with dtype ``timedelta64[ns]`` to a timezone-aware ``DatetimeIndex`` incorrectly dropped timezone information (:issue:`13905`) +- Bug in :func:`Timedelta.__floordiv__` and :func:`Timedelta.__rfloordiv__` dividing by many incompatible numpy objects was incorrectly allowed (:issue:`18846`) +- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - Bug in :class:`Timestamp` where comparison with an array of ``Timestamp`` objects would result in a ``RecursionError`` (:issue:`15183`) -- Bug in :class:`WeekOfMonth` and class:`Week` where addition and subtraction did not roll correctly (:issue:`18510`,:issue:`18672`,:issue:`18864`) -- Bug in :meth:`DatetimeIndex.astype` when converting between timezone aware dtypes, and converting from timezone aware to naive (:issue:`18951`) -- Bug in :class:`FY5253` where ``datetime`` addition and subtraction incremented incorrectly for dates on the year-end but not normalized to midnight (:issue:`18854`) -- Bug in :class:`DatetimeIndex` where adding or subtracting an array-like of ``DateOffset`` objects either raised (``np.array``, ``pd.Index``) or broadcast incorrectly (``pd.Series``) (:issue:`18849`) +- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where adding or subtracting an array-like of ``DateOffset`` objects either raised (``np.array``, ``pd.Index``) or broadcast incorrectly (``pd.Series``) (:issue:`18849`) - Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`) -- Bug in :class:`FY5253Quarter`, :class:`LastWeekOfMonth` where rollback and rollforward behavior was inconsistent with addition and 
subtraction behavior (:issue:`18854`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) - Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (issue:`19042`) -- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (issue:`19043`) +- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) +- Bug in :class:`DatetimeIndex` where the repr was not showing high-precision time values at the end of a day (e.g., 23:59:59.999999999) (:issue:`19030`) +- Bug where dividing a scalar timedelta-like object with :class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) +- Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) +- Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`) +- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) +- Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) +- Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) +- Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) +- Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) +- Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - +Timezones +^^^^^^^^^ + +- Bug in creating a ``Series`` from an array that contains both tz-naive and tz-aware values will result in a ``Series`` whose dtype is tz-aware instead of object (:issue:`16406`) +- Bug in comparison of timezone-aware :class:`DatetimeIndex` against ``NaT`` incorrectly raising ``TypeError`` (:issue:`19276`) +- Bug in :meth:`DatetimeIndex.astype` when converting between timezone aware dtypes, and converting from timezone aware to naive (:issue:`18951`) +- Bug in comparing :class:`DatetimeIndex`, which failed to raise ``TypeError`` when attempting to compare timezone-aware and timezone-naive datetimelike objects (:issue:`18162`) +- Bug in localization of a naive, datetime string in a ``Series`` constructor with a ``datetime64[ns, tz]`` dtype (:issue:`174151`) +- :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`) +- Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was 
incorrect (:issue:`17558`) +- Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) +- Bug in the :class:`DataFrame` constructor, where tz-aware DatetimeIndex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) +- Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) + +Offsets +^^^^^^^ + +- Bug in :class:`WeekOfMonth` and :class:`Week` where addition and subtraction did not roll correctly (:issue:`18510`,:issue:`18672`,:issue:`18864`) +- Bug in :class:`WeekOfMonth` and :class:`LastWeekOfMonth` where default keyword arguments for constructor raised ``ValueError`` (:issue:`19142`) +- Bug in :class:`FY5253Quarter`, :class:`LastWeekOfMonth` where rollback and rollforward behavior was inconsistent with addition and subtraction behavior (:issue:`18854`) +- Bug in :class:`FY5253` where ``datetime`` addition and subtraction incremented incorrectly for dates on the year-end but not normalized to midnight (:issue:`18854`) +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) + + +Numeric +^^^^^^^ +- Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`) +- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) +- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) +- Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) +- Bug in :class:`DataFrame` flex arithmetic (e.g. 
`df.add(other, fill_value=foo)`) with a `fill_value` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) + + Indexing ^^^^^^^^ -- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) -- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) -- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`) -- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) -- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`) - Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`) - Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`) -- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) -- Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) +- Bug in :meth:`~DataFrame.drop`, :meth:`~Panel.drop`, :meth:`~Series.drop`, :meth:`~Index.drop` where no ``KeyError`` is raised when dropping a non-existent element from an axis that contains duplicates (:issue:`19186`) - Bug in indexing a datetimelike ``Index`` that raised ``ValueError`` instead of ``IndexError`` (:issue:`18386`). -- Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`) - :func:`Index.to_series` now accepts ``index`` and ``name`` kwargs (:issue:`18699`) - :func:`DatetimeIndex.to_series` now accepts ``index`` and ``name`` kwargs (:issue:`18699`) - Bug in indexing non-scalar value from ``Series`` having non-unique ``Index`` will return value flattened (:issue:`17610`) -- Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) - Bug in ``__setitem__`` when indexing a :class:`DataFrame` with a 2-d boolean ndarray (:issue:`18582`) - Bug in ``str.extractall`` when there were no matches empty :class:`Index` was returned instead of appropriate :class:`MultiIndex` (:issue:`19034`) +- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) +- Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) +- Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`) + +MultiIndex +^^^^^^^^^^ + +- Bug in :func:`MultiIndex.__contains__` where non-tuple keys would return ``True`` even if they had been dropped (:issue:`19027`) +- Bug in :func:`MultiIndex.set_labels` which would cause casting (and potentially clipping) of the new labels if the ``level`` argument is not 0 or a list like [0, 1, ... 
] (:issue:`19057`) +- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`) +- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) +- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`) +- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`) +- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`) +- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`) +- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`) + I/O ^^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) +- :meth:`DataFrame.to_html` now has an option to add an id to the leading ``<table>`` tag (:issue:`8496`) - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) +- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`) - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) -- +- Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) +- :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) +- :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`) +- Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) Plotting ^^^^^^^^ - :func: `DataFrame.plot` now raises a ``ValueError`` when the ``x`` or ``y`` argument is improperly formed (:issue:`18671`) - Bug in formatting tick labels with ``datetime.time()`` and fractional seconds (:issue:`18478`). -- +- :meth:`Series.plot.kde` has exposed the args ``ind`` and ``bw_method`` in the docstring (:issue:`18461`). The argument ``ind`` may now also be an integer (number of sample points). 
- Groupby/Resample/Rolling @@ -390,43 +809,39 @@ Groupby/Resample/Rolling - Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) - Fixed regression in :func:`DataFrame.groupby` which would not emit an error when called with a tuple key not in the index (:issue:`18798`) -- -- +- Bug in :func:`DataFrame.resample` which silently ignored unsupported (or mistyped) options for ``label``, ``closed`` and ``convention`` (:issue:`19303`) +- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) +- Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) +- Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) +- Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) +- Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) +- Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) Sparse ^^^^^^ -- -- -- +- Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) +- Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) +- Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) Reshaping ^^^^^^^^^ - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) +- Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`) +- Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. 
The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) - - -Numeric -^^^^^^^ - -- Bug in :func:`Series.__sub__` subtracting a non-nanosecond ``np.datetime64`` object from a ``Series`` gave incorrect results (:issue:`7996`) -- Bug in :class:`DatetimeIndex`, :class:`TimedeltaIndex` addition and subtraction of zero-dimensional integer arrays gave incorrect results (:issue:`19012`) -- Bug in :func:`Series.__add__` adding Series with dtype ``timedelta64[ns]`` to a timezone-aware ``DatetimeIndex`` incorrectly dropped timezone information (:issue:`13905`) -- - -Categorical -^^^^^^^^^^^ - -- -- -- +- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) +- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) +- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) +- Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) +- Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) +- Bug in :func:`DataFrame.join` which does an *outer* instead of a *left* join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) -- :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`) From 02d4369e91422dc6a0c99f622ac1eab00163d808 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 11 Feb 2018 22:16:37 +0100 Subject: [PATCH 19/21] typo --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ecdbb9cdd0ad2..4cf7c8013aa2b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2792,7 +2792,7 @@ def test_is_monotonic_increasing(self, in_vals, out_vals): tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. - expecteAd = ( + expected = ( df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) tm.assert_series_equal(result, expected) From 13d4dd959bbd879b5c5a71ddfeaffcf8cdecf07e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 11 Feb 2018 22:16:25 -0500 Subject: [PATCH 20/21] Add ticks for function rendering --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6436a8464b5d0..30496c51058ec 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -323,7 +323,7 @@ Other Enhancements - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) -- :func: groupby.is_monotonic_increasing and :func: .is_monotonic_decreasing extend :func: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. 
(:issue:`17015`) +- :func:`groupby.is_monotonic_increasing` and :func:`.is_monotonic_decreasing` extend `:func: Series.is_monotonic_increasing` to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) .. _whatsnew_0230.api_breaking: From 787c4362604ffa3d0ac2033a546d17de24da0b20 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 12 Feb 2018 14:54:15 -0500 Subject: [PATCH 21/21] Make whatsnew statement more concise --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 30496c51058ec..3990edac8584a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -323,7 +323,7 @@ Other Enhancements - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) -- :func:`groupby.is_monotonic_increasing` and :func:`.is_monotonic_decreasing` extend `:func: Series.is_monotonic_increasing` to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) +- Added :func:`SeriesGroupBy.is_monotonic_increasing` and :func:`SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) .. _whatsnew_0230.api_breaking:
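The series thus lands the feature through the ``SeriesGroupBy`` apply whitelist rather than explicit method definitions, so the final access is property-style, mirroring :func:`Series.is_monotonic_increasing`. A short usage sketch against the finished branch (expected output taken from the removed docstring examples above):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({'B': ['cat_a', 'cat_a', 'cat_a',
                             'cat_b', 'cat_b', 'cat_b'],
                       'C': [1, 2, 3, 2, 2, 0]})

    # Whitelist dispatch: no parentheses, as in the final tests.
    df.groupby('B').C.is_monotonic_increasing
    # B
    # cat_a     True
    # cat_b    False
    # Name: C, dtype: bool

    df.groupby('B').C.is_monotonic_decreasing
    # B
    # cat_a    False
    # cat_b     True
    # Name: C, dtype: bool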