From 0830e36583a8ccf7a31479c22eca40b03769b108 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 22 Jan 2020 22:18:09 -0600 Subject: [PATCH 01/26] Add value_counts tests --- pandas/tests/frame/test_value_counts.py | 111 ++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 pandas/tests/frame/test_value_counts.py diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py new file mode 100644 index 0000000000000..565ff4fcdf3b8 --- /dev/null +++ b/pandas/tests/frame/test_value_counts.py @@ -0,0 +1,111 @@ +import pytest + +import numpy as np +import pandas as pd +import pandas._testing as tm + + +def test_data_frame_value_counts_unsorted(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts(sort=False) + expected = pd.Series( + data=[1, 2, 1], + index=pd.MultiIndex.from_arrays( + [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_ascending(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts(ascending=True) + expected = pd.Series( + data=[1, 1, 2], + index=pd.MultiIndex.from_arrays( + [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_default(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays( + [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_normalize(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts(normalize=True) + expected = pd.Series( + data=[0.5, 0.25, 0.25], + index=pd.MultiIndex.from_arrays( + [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_dropna_not_supported_yet(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + with pytest.raises(NotImplementedError, match="not yet supported"): + df.value_counts(dropna=False) + + +def test_data_frame_value_counts_bins_not_supported(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + with pytest.raises(NotImplementedError, match="not yet supported"): + df.value_counts(bins=2) + + +def test_data_frame_value_counts_single_col_default(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + df_single_col = df[["num_legs"]] + result = df_single_col.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), + ) + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty(): + df_no_cols = pd.DataFrame() + result = df_no_cols.value_counts() + expected = pd.Series([], dtype=np.int64) + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty_normalize(): + df_no_cols = pd.DataFrame() + result = df_no_cols.value_counts(normalize=True) + expected = pd.Series([], dtype=np.float64) + tm.assert_series_equal(result, expected) From d946e93db1638f31791ea4120cc270b597c0a420 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 22 Jan 2020 22:18:33 -0600 Subject: [PATCH 02/26] Update docs --- doc/source/reference/frame.rst | 1 + doc/source/whatsnew/v1.1.0.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 01aa6c60e3b2f..f9aa87e0235da 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -168,6 +168,7 @@ Computations / descriptive stats DataFrame.std DataFrame.var DataFrame.nunique + DataFrame.value_counts Reindexing / selection / label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c8e811ce82b1f..5bd00f3429687 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -54,7 +54,7 @@ Other API changes - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) -- +- Added :meth:`DataFrame.value_counts` (:issue:`5377`) - .. --------------------------------------------------------------------------- From 7d9306de423dd2286c8b3fb3e89c694e3f72330c Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 22 Jan 2020 22:21:05 -0600 Subject: [PATCH 03/26] Start implementing value_counts --- pandas/core/frame.py | 117 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4257083cc8dc5..267440a9c94e1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -106,7 +106,7 @@ from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.multi import maybe_droplevels +from pandas.core.indexes.multi import MultiIndex, maybe_droplevels from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.core.internals import BlockManager @@ -5064,6 +5064,121 @@ def sort_index( else: return self._constructor(new_data).__finalize__(self) + def value_counts( + self, + subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ): + """ + Return a Series containing counts of unique rows in the DataFrame. + .. versionadded:: 1.0.0 + The returned Series will have a MultiIndex with one level per input + column. + By default, rows that contain any NaN value are omitted from the + results. + By default, the resulting series will be in descending order so that the + first element is the most frequently-occurring row. + Parameters + ---------- + subset : list-like, default self.columns + Columns to use when counting unique combinations. + normalize : boolean, default False + Return proportions rather than frequencies. + sort : boolean, default True + Sort by frequencies. + ascending : boolean, default False + Sort in ascending order. + bins : integer, optional + This parameter is not yet supported and must be set to None (the + default value). It exists to ensure compatibiliy with + `Series.value_counts`. + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with single-column numeric + data. + dropna : boolean, default True + This parameter is not yet supported and must be set to True (the + default value). It exists to ensure compatibiliy with + `Series.value_counts`. + Don't include counts of rows containing NaN. + Returns + ------- + counts : Series + See Also + -------- + Series.value_counts: Equivalent method on Series. + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], + ... 'num_wings': [2, 0, 0, 0]}, + ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 + >>> df.value_counts() + num_legs num_wings + 4 0 2 + 6 0 1 + 2 2 1 + dtype: int64 + >>> df.value_counts(sort=False) + num_legs num_wings + 2 2 1 + 4 0 2 + 6 0 1 + dtype: int64 + >>> df.value_counts(ascending=True) + num_legs num_wings + 2 2 1 + 6 0 1 + 4 0 2 + dtype: int64 + >>> df.value_counts(normalize=True) + num_legs num_wings + 4 0 0.50 + 6 0 0.25 + 2 2 0.25 + dtype: float64 + >>> single_col_df = df[['num_legs']] + >>> single_col_df.value_counts(bins=4) + num_legs + (3.0, 4.0] 2 + (5.0, 6.0] 1 + (1.995, 3.0] 1 + (4.0, 5.0] 0 + dtype: int64 + """ + if subset is None: + subset = self.columns.tolist() + + # Some features not supported yet + if not dropna: + raise NotImplementedError( + "`dropna=False` not yet supported for DataFrames." + ) + if (bins is not None) and (len(subset) > 1): + raise NotImplementedError( + "`bins` parameter not yet supported for more than one column." + ) + + counts = self.groupby(subset).size() + + if sort: + counts = counts.sort_values(ascending=ascending) + if normalize: + counts /= counts.sum() + # Force MultiIndex for single column + if len(subset) == 1: + counts.index = MultiIndex.from_arrays([counts.index]) + + return counts + def nlargest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in descending order. From 2e58db44335efcc38dd98f0cc04a5620104d8962 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 22 Jan 2020 22:26:03 -0600 Subject: [PATCH 04/26] Set MultiIndex name --- pandas/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 267440a9c94e1..025953826499e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5175,7 +5175,9 @@ def value_counts( counts /= counts.sum() # Force MultiIndex for single column if len(subset) == 1: - counts.index = MultiIndex.from_arrays([counts.index]) + counts.index = MultiIndex.from_arrays( + [counts.index], names=[counts.index.name] + ) return counts From 25d7f2f30d6eea7b22727511b5a62013ed7b4393 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 23 Jan 2020 07:59:48 -0600 Subject: [PATCH 05/26] Format --- pandas/core/frame.py | 17 +++++++++++------ pandas/tests/frame/test_value_counts.py | 20 ++++++++++++++++++-- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 025953826499e..2545d97ff9c79 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5075,13 +5075,14 @@ def value_counts( ): """ Return a Series containing counts of unique rows in the DataFrame. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 The returned Series will have a MultiIndex with one level per input column. - By default, rows that contain any NaN value are omitted from the - results. - By default, the resulting series will be in descending order so that the + By default, rows that contain any NA values are omitted from the + result. + By default, the resulting Series will be in descending order so that the first element is the most frequently-occurring row. + Parameters ---------- subset : list-like, default self.columns @@ -5103,13 +5104,16 @@ def value_counts( This parameter is not yet supported and must be set to True (the default value). It exists to ensure compatibiliy with `Series.value_counts`. - Don't include counts of rows containing NaN. + Don't include counts of rows containing NA values. + Returns ------- - counts : Series + Series + See Also -------- Series.value_counts: Equivalent method on Series. + Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], @@ -5162,6 +5166,7 @@ def value_counts( raise NotImplementedError( "`dropna=False` not yet supported for DataFrames." ) + if (bins is not None) and (len(subset) > 1): raise NotImplementedError( "`bins` parameter not yet supported for more than one column." diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py index 565ff4fcdf3b8..28089e5bd34be 100644 --- a/pandas/tests/frame/test_value_counts.py +++ b/pandas/tests/frame/test_value_counts.py @@ -10,6 +10,7 @@ def test_data_frame_value_counts_unsorted(): {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) + result = df.value_counts(sort=False) expected = pd.Series( data=[1, 2, 1], @@ -17,6 +18,7 @@ def test_data_frame_value_counts_unsorted(): [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] ), ) + tm.assert_series_equal(result, expected) @@ -25,6 +27,7 @@ def test_data_frame_value_counts_ascending(): {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) + result = df.value_counts(ascending=True) expected = pd.Series( data=[1, 1, 2], @@ -32,6 +35,7 @@ def test_data_frame_value_counts_ascending(): [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] ), ) + tm.assert_series_equal(result, expected) @@ -40,6 +44,7 @@ def test_data_frame_value_counts_default(): {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) + result = df.value_counts() expected = pd.Series( data=[2, 1, 1], @@ -47,6 +52,7 @@ def test_data_frame_value_counts_default(): [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] ), ) + tm.assert_series_equal(result, expected) @@ -55,6 +61,7 @@ def test_data_frame_value_counts_normalize(): {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) + result = df.value_counts(normalize=True) expected = pd.Series( data=[0.5, 0.25, 0.25], @@ -62,6 +69,7 @@ def test_data_frame_value_counts_normalize(): [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] ), ) + tm.assert_series_equal(result, expected) @@ -70,6 +78,7 @@ def test_data_frame_value_counts_dropna_not_supported_yet(): {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) + with pytest.raises(NotImplementedError, match="not yet supported"): df.value_counts(dropna=False) @@ -79,33 +88,40 @@ def test_data_frame_value_counts_bins_not_supported(): {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) + with pytest.raises(NotImplementedError, match="not yet supported"): df.value_counts(bins=2) - - + + def test_data_frame_value_counts_single_col_default(): df = pd.DataFrame( {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) df_single_col = df[["num_legs"]] + result = df_single_col.value_counts() expected = pd.Series( data=[2, 1, 1], index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), ) + tm.assert_series_equal(result, expected) def test_data_frame_value_counts_empty(): df_no_cols = pd.DataFrame() + result = df_no_cols.value_counts() expected = pd.Series([], dtype=np.int64) + tm.assert_series_equal(result, expected) def test_data_frame_value_counts_empty_normalize(): df_no_cols = pd.DataFrame() + result = df_no_cols.value_counts(normalize=True) expected = pd.Series([], dtype=np.float64) + tm.assert_series_equal(result, expected) From aa96c9805becb5c6db61a18a9c87435aa0e412b2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 23 Jan 2020 08:35:38 -0600 Subject: [PATCH 06/26] Sort imports --- pandas/tests/frame/test_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py index 28089e5bd34be..5dc51d0c685aa 100644 --- a/pandas/tests/frame/test_value_counts.py +++ b/pandas/tests/frame/test_value_counts.py @@ -1,6 +1,6 @@ +import numpy as np import pytest -import numpy as np import pandas as pd import pandas._testing as tm From aef75ae62ac68060f40347c5f69399fafabd8c03 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 23 Jan 2020 09:40:19 -0600 Subject: [PATCH 07/26] Remove typing for now --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2545d97ff9c79..f6be59754d812 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5066,7 +5066,7 @@ def sort_index( def value_counts( self, - subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, + subset=None, normalize: bool = False, sort: bool = True, ascending: bool = False, From acb81cc73ae39da619fe9a9e15f2ff8d8db11550 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 23 Jan 2020 15:59:59 -0600 Subject: [PATCH 08/26] Simplify test a little --- pandas/tests/frame/test_value_counts.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py index 5dc51d0c685aa..ef2dc790940b0 100644 --- a/pandas/tests/frame/test_value_counts.py +++ b/pandas/tests/frame/test_value_counts.py @@ -94,13 +94,9 @@ def test_data_frame_value_counts_bins_not_supported(): def test_data_frame_value_counts_single_col_default(): - df = pd.DataFrame( - {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - index=["falcon", "dog", "cat", "ant"], - ) - df_single_col = df[["num_legs"]] + df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) - result = df_single_col.value_counts() + result = df.value_counts() expected = pd.Series( data=[2, 1, 1], index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), From 786de344cdda81561b0e6e53d1a3bc586d6a0813 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 23 Jan 2020 16:00:45 -0600 Subject: [PATCH 09/26] Remove single col example for now --- pandas/core/frame.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f6be59754d812..8e27af019c30b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5149,14 +5149,6 @@ def value_counts( 6 0 0.25 2 2 0.25 dtype: float64 - >>> single_col_df = df[['num_legs']] - >>> single_col_df.value_counts(bins=4) - num_legs - (3.0, 4.0] 2 - (5.0, 6.0] 1 - (1.995, 3.0] 1 - (4.0, 5.0] 0 - dtype: int64 """ if subset is None: subset = self.columns.tolist() From 7eba59ae8e72ee7a4bf9e3db9bbe9b8e6b831b4c Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 23 Jan 2020 16:52:27 -0600 Subject: [PATCH 10/26] Update error for bins --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8e27af019c30b..5a7e3f7c8e1c9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5159,9 +5159,9 @@ def value_counts( "`dropna=False` not yet supported for DataFrames." ) - if (bins is not None) and (len(subset) > 1): + if bins is not None: raise NotImplementedError( - "`bins` parameter not yet supported for more than one column." + "`bins` parameter not yet supported for DataFrames." ) counts = self.groupby(subset).size() From 60554e91661be4b349259ceda6bdf47e71469438 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 24 Jan 2020 13:00:55 -0600 Subject: [PATCH 11/26] Update pandas/core/frame.py Co-Authored-By: William Ayd --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5a7e3f7c8e1c9..a777da3b95032 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5066,7 +5066,7 @@ def sort_index( def value_counts( self, - subset=None, + subset: Sequence[Label] = None, normalize: bool = False, sort: bool = True, ascending: bool = False, From 4c4e858ffe0c0a59142904182e9f094e3464fd04 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 24 Jan 2020 13:01:01 -0600 Subject: [PATCH 12/26] Update pandas/core/frame.py Co-Authored-By: William Ayd --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a777da3b95032..3c1b99e7e364f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5070,7 +5070,7 @@ def value_counts( normalize: bool = False, sort: bool = True, ascending: bool = False, - bins=None, + bins: Optional[int] = None, dropna: bool = True, ): """ From 07f0e762ceeb6b493fd05a2d0eb67d2a965d886c Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 24 Jan 2020 13:01:59 -0600 Subject: [PATCH 13/26] Import Label type --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3c1b99e7e364f..c6170abac437d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -39,7 +39,7 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib -from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer +from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv From 957a8ecb78585363ebb4b62906442b43dd3995a3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 24 Jan 2020 13:13:59 -0600 Subject: [PATCH 14/26] Make Sequence optional --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6421cc466fbf3..c6aa4048cc90f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5072,7 +5072,7 @@ def sort_index( def value_counts( self, - subset: Sequence[Label] = None, + subset: Optional[Sequence[Label]] = None, normalize: bool = False, sort: bool = True, ascending: bool = False, From 4fee5e038bdeff1837c32937b199c98dbff01372 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 24 Jan 2020 15:27:58 -0600 Subject: [PATCH 15/26] Fix docstring --- pandas/core/frame.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c6aa4048cc90f..c0b45801dc4bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5091,22 +5091,22 @@ def value_counts( Parameters ---------- - subset : list-like, default self.columns + subset : list-like, optional Columns to use when counting unique combinations. - normalize : boolean, default False + normalize : bool, default False Return proportions rather than frequencies. - sort : boolean, default True + sort : bool, default True Sort by frequencies. - ascending : boolean, default False + ascending : bool, default False Sort in ascending order. - bins : integer, optional + bins : int, optional This parameter is not yet supported and must be set to None (the default value). It exists to ensure compatibiliy with `Series.value_counts`. Rather than count values, group them into half-open bins, a convenience for ``pd.cut``, only works with single-column numeric data. - dropna : boolean, default True + dropna : bool, default True This parameter is not yet supported and must be set to True (the default value). It exists to ensure compatibiliy with `Series.value_counts`. From b8f4126453f5708ba140cc36853d640e59def874 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Jan 2020 00:51:45 -0600 Subject: [PATCH 16/26] Clean docstring --- pandas/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c0b45801dc4bb..f4457bba02fdc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5081,7 +5081,9 @@ def value_counts( ): """ Return a Series containing counts of unique rows in the DataFrame. + .. versionadded:: 1.1.0 + The returned Series will have a MultiIndex with one level per input column. By default, rows that contain any NA values are omitted from the @@ -5131,24 +5133,28 @@ def value_counts( dog 4 0 cat 4 0 ant 6 0 + >>> df.value_counts() num_legs num_wings 4 0 2 6 0 1 2 2 1 dtype: int64 + >>> df.value_counts(sort=False) num_legs num_wings 2 2 1 4 0 2 6 0 1 dtype: int64 + >>> df.value_counts(ascending=True) num_legs num_wings 2 2 1 6 0 1 4 0 2 dtype: int64 + >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 From 310c688d077b4be38ac78ae63a79592ec44ec737 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Jan 2020 00:58:12 -0600 Subject: [PATCH 17/26] Update to comments --- pandas/core/frame.py | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4457bba02fdc..434752b1bd60d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5076,8 +5076,6 @@ def value_counts( normalize: bool = False, sort: bool = True, ascending: bool = False, - bins: Optional[int] = None, - dropna: bool = True, ): """ Return a Series containing counts of unique rows in the DataFrame. @@ -5101,18 +5099,6 @@ def value_counts( Sort by frequencies. ascending : bool, default False Sort in ascending order. - bins : int, optional - This parameter is not yet supported and must be set to None (the - default value). It exists to ensure compatibiliy with - `Series.value_counts`. - Rather than count values, group them into half-open bins, - a convenience for ``pd.cut``, only works with single-column numeric - data. - dropna : bool, default True - This parameter is not yet supported and must be set to True (the - default value). It exists to ensure compatibiliy with - `Series.value_counts`. - Don't include counts of rows containing NA values. Returns ------- @@ -5163,18 +5149,7 @@ def value_counts( dtype: float64 """ if subset is None: - subset = self.columns.tolist() - - # Some features not supported yet - if not dropna: - raise NotImplementedError( - "`dropna=False` not yet supported for DataFrames." - ) - - if bins is not None: - raise NotImplementedError( - "`bins` parameter not yet supported for DataFrames." - ) + subset = self.columns counts = self.groupby(subset).size() @@ -5182,6 +5157,7 @@ def value_counts( counts = counts.sort_values(ascending=ascending) if normalize: counts /= counts.sum() + # Force MultiIndex for single column if len(subset) == 1: counts.index = MultiIndex.from_arrays( From a2660217d1765fd0f52b49e27f9abf4a8d1501cd Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Jan 2020 01:18:39 -0600 Subject: [PATCH 18/26] Add to Series See Also --- pandas/core/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index 6ad237cbc7c51..0986ff3942c10 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1202,6 +1202,7 @@ def value_counts( -------- Series.count: Number of non-NA elements in a Series. DataFrame.count: Number of non-NA elements in a DataFrame. + DataFrame.value_counts: Equivalent method on DataFrames. Examples -------- From d738bf7d5f4cbbf221e6f45e4bb51c71cd4333b8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Jan 2020 11:19:23 -0600 Subject: [PATCH 19/26] Update tests and add back tolist --- pandas/core/frame.py | 2 +- pandas/tests/frame/test_value_counts.py | 20 -------------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 434752b1bd60d..e73a3088aa047 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5149,7 +5149,7 @@ def value_counts( dtype: float64 """ if subset is None: - subset = self.columns + subset = self.columns.tolist() counts = self.groupby(subset).size() diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py index ef2dc790940b0..ec4e148462354 100644 --- a/pandas/tests/frame/test_value_counts.py +++ b/pandas/tests/frame/test_value_counts.py @@ -73,26 +73,6 @@ def test_data_frame_value_counts_normalize(): tm.assert_series_equal(result, expected) -def test_data_frame_value_counts_dropna_not_supported_yet(): - df = pd.DataFrame( - {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - index=["falcon", "dog", "cat", "ant"], - ) - - with pytest.raises(NotImplementedError, match="not yet supported"): - df.value_counts(dropna=False) - - -def test_data_frame_value_counts_bins_not_supported(): - df = pd.DataFrame( - {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - index=["falcon", "dog", "cat", "ant"], - ) - - with pytest.raises(NotImplementedError, match="not yet supported"): - df.value_counts(bins=2) - - def test_data_frame_value_counts_single_col_default(): df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) From 26182203a580806b4fa3c575abd3b057207f4340 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Jan 2020 12:04:32 -0600 Subject: [PATCH 20/26] Don't import pytest --- pandas/tests/frame/test_value_counts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py index ec4e148462354..c409b0bbe6fa9 100644 --- a/pandas/tests/frame/test_value_counts.py +++ b/pandas/tests/frame/test_value_counts.py @@ -1,5 +1,4 @@ import numpy as np -import pytest import pandas as pd import pandas._testing as tm From 0d46697374b48a4d3581f92ebb03bb11fa72dc28 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 9 Feb 2020 13:03:47 -0600 Subject: [PATCH 21/26] Add to basics.rst --- doc/source/getting_started/basics.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 277080006cb3c..0afa451887dab 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -689,6 +689,15 @@ of a 1D array of values. It can also be used as a function on regular arrays: s.value_counts() pd.value_counts(data) +The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns. +By default all columns are used but a subset can be selected using the ``subset`` argument. + +.. ipython:: python + + data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]} + df = pd.DataFrame(data) + df.value_counts() + Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: .. ipython:: python From 81991a10a7681855fd2fed9bc06747150fa7bd82 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 9 Feb 2020 13:07:50 -0600 Subject: [PATCH 22/26] Move to Notes --- pandas/core/frame.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50ae785ae039d..db12e9eb0e58d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5091,13 +5091,6 @@ def value_counts( .. versionadded:: 1.1.0 - The returned Series will have a MultiIndex with one level per input - column. - By default, rows that contain any NA values are omitted from the - result. - By default, the resulting Series will be in descending order so that the - first element is the most frequently-occurring row. - Parameters ---------- subset : list-like, optional @@ -5117,6 +5110,13 @@ def value_counts( -------- Series.value_counts: Equivalent method on Series. + Notes + ----- + The returned Series will have a MultiIndex with one level per input + column. By default, rows that contain any NA values are omitted from + the result. By default, the resulting Series will be in descending + order so that the first element is the most frequently-occurring row. + Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], From 85bc2136b2dfd2163894bf800397b182621a1d5f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 9 Feb 2020 14:14:33 -0600 Subject: [PATCH 23/26] Rename to avoid doc error --- doc/source/getting_started/basics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 0afa451887dab..3f72b3d206d25 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -695,8 +695,8 @@ By default all columns are used but a subset can be selected using the ``subset` .. ipython:: python data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]} - df = pd.DataFrame(data) - df.value_counts() + frame = pd.DataFrame(data) + frame.value_counts() Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: From 47683ad74b03f262dd9e17ce964f85548a117d99 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 23 Feb 2020 12:59:37 -0600 Subject: [PATCH 24/26] Add See Also --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e4af8e3550b66..3fc10444ee064 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4569,6 +4569,10 @@ def drop_duplicates( ------- DataFrame DataFrame with duplicates removed or None if ``inplace=True``. + + See Also + -------- + DataFrame.value_counts: Count unique combinations of columns. """ if self.empty: return self.copy() From e60de83d7cf7dabcba59a4f8519db5b45d414032 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 23 Feb 2020 13:00:30 -0600 Subject: [PATCH 25/26] Move tests --- pandas/tests/frame/{ => methods}/test_value_counts.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/frame/{ => methods}/test_value_counts.py (100%) diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py similarity index 100% rename from pandas/tests/frame/test_value_counts.py rename to pandas/tests/frame/methods/test_value_counts.py From 3903a4db317c0e981b3af2dec3030dc92f95621e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 23 Feb 2020 13:07:21 -0600 Subject: [PATCH 26/26] versionadded --- doc/source/getting_started/basics.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 3f72b3d206d25..c6d9a48fcf8ed 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -689,6 +689,8 @@ of a 1D array of values. It can also be used as a function on regular arrays: s.value_counts() pd.value_counts(data) +.. versionadded:: 1.1.0 + The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns. By default all columns are used but a subset can be selected using the ``subset`` argument.