diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 277080006cb3c..c6d9a48fcf8ed 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -689,6 +689,17 @@ of a 1D array of values. It can also be used as a function on regular arrays: s.value_counts() pd.value_counts(data) +.. versionadded:: 1.1.0 + +The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns. +By default all columns are used but a subset can be selected using the ``subset`` argument. + +.. ipython:: python + + data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]} + frame = pd.DataFrame(data) + frame.value_counts() + Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: .. ipython:: python diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index c7b1cc1c832be..b326bbb5a465e 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -170,6 +170,7 @@ Computations / descriptive stats DataFrame.std DataFrame.var DataFrame.nunique + DataFrame.value_counts Reindexing / selection / label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 705c335acfb48..1414abe13bcf1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -55,6 +55,7 @@ Other API changes - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) +- Added :meth:`DataFrame.value_counts` (:issue:`5377`) - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of 
def value_counts(
    self,
    subset: Optional[Sequence[Label]] = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
):
    """
    Return a Series containing counts of unique rows in the DataFrame.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations. When omitted,
        all columns are used. A single label that is not list-like is
        accepted as one grouping key; the result then keeps a flat Index
        instead of a MultiIndex.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order. Only used when ``sort`` is True.

    Returns
    -------
    Series

    See Also
    --------
    Series.value_counts: Equivalent method on Series.

    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column. Rows that contain any NA values are omitted from the result.
    By default, the resulting Series will be in descending order so that
    the first element is the most frequently-occurring row.

    Examples
    --------
    >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
    ...                    'num_wings': [2, 0, 0, 0]},
    ...                   index=['falcon', 'dog', 'cat', 'ant'])
    >>> df
            num_legs  num_wings
    falcon         2          2
    dog            4          0
    cat            4          0
    ant            6          0

    >>> df.value_counts()
    num_legs  num_wings
    4         0            2
    6         0            1
    2         2            1
    dtype: int64

    >>> df.value_counts(sort=False)
    num_legs  num_wings
    2         2            1
    4         0            2
    6         0            1
    dtype: int64

    >>> df.value_counts(ascending=True)
    num_legs  num_wings
    2         2            1
    6         0            1
    4         0            2
    dtype: int64

    >>> df.value_counts(normalize=True)
    num_legs  num_wings
    4         0            0.50
    6         0            0.25
    2         2            0.25
    dtype: float64
    """
    # Imported locally so this method does not depend on a module-level
    # import that may not be present.
    from pandas.api.types import is_list_like

    if subset is None:
        subset = self.columns.tolist()

    # One group per distinct combination of the selected columns; the group
    # sizes are the row counts.
    counts = self.groupby(subset).size()

    if sort:
        counts = counts.sort_values(ascending=ascending)
    if normalize:
        # Dividing by the total turns the counts into proportions.
        counts = counts / counts.sum()

    # Force MultiIndex for single column. The list-like guard matters: a
    # scalar label such as "a" has len() == 1 and would be wrapped by accident,
    # and a scalar label such as 0 has no len() at all.
    if is_list_like(subset) and len(subset) == 1:
        counts.index = MultiIndex.from_arrays(
            [counts.index], names=[counts.index.name]
        )

    return counts
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py new file mode 100644 index 0000000000000..c409b0bbe6fa9 --- /dev/null +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -0,0 +1,102 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_data_frame_value_counts_unsorted(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(sort=False) + expected = pd.Series( + data=[1, 2, 1], + index=pd.MultiIndex.from_arrays( + [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_ascending(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(ascending=True) + expected = pd.Series( + data=[1, 1, 2], + index=pd.MultiIndex.from_arrays( + [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_default(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays( + [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + ), + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_normalize(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(normalize=True) + expected = pd.Series( + data=[0.5, 0.25, 0.25], + index=pd.MultiIndex.from_arrays( + [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + ), + ) + + tm.assert_series_equal(result, expected) + + +def 
test_data_frame_value_counts_single_col_default(): + df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) + + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty(): + df_no_cols = pd.DataFrame() + + result = df_no_cols.value_counts() + expected = pd.Series([], dtype=np.int64) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty_normalize(): + df_no_cols = pd.DataFrame() + + result = df_no_cols.value_counts(normalize=True) + expected = pd.Series([], dtype=np.float64) + + tm.assert_series_equal(result, expected)