From 44c31db0a2fff4c070591238fcb3ba7193542266 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Mon, 23 Jan 2023 23:59:49 -0500 Subject: [PATCH 01/10] first value_counts commit --- pandas/core/frame.py | 9 ++++++++- .../tests/frame/methods/test_value_counts.py | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1d86c81745a6a..142fcb54d993f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6932,7 +6932,7 @@ def value_counts( Parameters ---------- - subset : list-like, optional + subset : mapping, function, label, list of labels, optional Columns to use when counting unique combinations. normalize : bool, default False Return proportions rather than frequencies. @@ -7024,6 +7024,13 @@ def value_counts( John Smith 1 NaN 1 dtype: int64 + + >>> df.value_counts("first_name") + first_name + John 2 + Anne 1 + Beth 1 + dtype: int64 """ if subset is None: subset = self.columns.tolist() diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 9859ffb83da66..d8b7e21ebc210 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -144,3 +144,22 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): ) tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_subset(nulls_fixture): + # GH 50829 + df = pd.DataFrame( + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + result = df.value_counts("first_name") + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays( + [["John", "Anne", "Beth"]], names=["first_name"] + ), + ) + + tm.assert_series_equal(result, expected) From c20e994ea5e3e2c3e3426e65d4624282cbb59c79 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Tue, 24 Jan 2023 14:52:38 
-0500 Subject: [PATCH 02/10] updated test_data_frame_value_counts_subset --- pandas/tests/frame/methods/test_value_counts.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index d8b7e21ebc210..8bf54aadf340c 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -157,9 +157,7 @@ def test_data_frame_value_counts_subset(nulls_fixture): result = df.value_counts("first_name") expected = pd.Series( data=[2, 1, 1], - index=pd.MultiIndex.from_arrays( - [["John", "Anne", "Beth"]], names=["first_name"] - ), + index=pd.Index(["John", "Anne", "Beth"], name="first_name"), ) tm.assert_series_equal(result, expected) From da419881f698bde16bcace407909aa6f4632a44a Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Sat, 28 Jan 2023 12:29:57 -0500 Subject: [PATCH 03/10] updated subset docstring --- pandas/core/frame.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 142fcb54d993f..43e3f36170ea6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,7 +16,6 @@ import functools from io import StringIO import itertools -import sys from textwrap import dedent from typing import ( TYPE_CHECKING, @@ -49,7 +48,7 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, - is_range_indexer, + array_equal_fast, no_default, ) from pandas._typing import ( @@ -92,17 +91,12 @@ WriteBuffer, npt, ) -from pandas.compat import PYPY from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import ( function as nv, np_percentile_argname, ) -from pandas.errors import ( - ChainedAssignmentError, - InvalidIndexError, - _chained_assignment_msg, -) +from pandas.errors import InvalidIndexError from pandas.util._decorators import ( 
Appender, Substitution, @@ -3868,10 +3862,6 @@ def isetitem(self, loc, value) -> None: self._iset_item_mgr(loc, arraylike, inplace=False) def __setitem__(self, key, value): - if not PYPY and using_copy_on_write(): - if sys.getrefcount(self) <= 3: - raise ChainedAssignmentError(_chained_assignment_msg) - key = com.apply_if_callable(key, self) # see if we can slice the rows @@ -6734,7 +6724,7 @@ def sort_values( else: return self.copy(deep=None) - if is_range_indexer(indexer, len(indexer)): + if array_equal_fast(indexer, np.arange(0, len(indexer), dtype=indexer.dtype)): if inplace: return self._update_inplace(self) else: @@ -6932,8 +6922,8 @@ def value_counts( Parameters ---------- - subset : mapping, function, label, list of labels, optional - Columns to use when counting unique combinations. + subset : label or list of labels, optional + Column(s) to use when counting unique combinations. normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True @@ -10887,7 +10877,11 @@ def quantile( f"Invalid method: {method}. Method must be in {valid_method}." 
) if method == "single": - res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) + # error: Argument "qs" to "quantile" of "BlockManager" has incompatible type + # "Index"; expected "Float64Index" + res = data._mgr.quantile( + qs=q, axis=1, interpolation=interpolation # type: ignore[arg-type] + ) elif method == "table": valid_interpolation = {"nearest", "lower", "higher"} if interpolation not in valid_interpolation: From efae1bf006277f87e2bfa44ab80ddb0397fc659e Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Sun, 29 Jan 2023 22:40:14 -0500 Subject: [PATCH 04/10] Back to pre updated subset docstring commit --- pandas/core/frame.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 43e3f36170ea6..142fcb54d993f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,6 +16,7 @@ import functools from io import StringIO import itertools +import sys from textwrap import dedent from typing import ( TYPE_CHECKING, @@ -48,7 +49,7 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, - array_equal_fast, + is_range_indexer, no_default, ) from pandas._typing import ( @@ -91,12 +92,17 @@ WriteBuffer, npt, ) +from pandas.compat import PYPY from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import ( function as nv, np_percentile_argname, ) -from pandas.errors import InvalidIndexError +from pandas.errors import ( + ChainedAssignmentError, + InvalidIndexError, + _chained_assignment_msg, +) from pandas.util._decorators import ( Appender, Substitution, @@ -3862,6 +3868,10 @@ def isetitem(self, loc, value) -> None: self._iset_item_mgr(loc, arraylike, inplace=False) def __setitem__(self, key, value): + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= 3: + raise ChainedAssignmentError(_chained_assignment_msg) + key = com.apply_if_callable(key, self) 
# see if we can slice the rows @@ -6724,7 +6734,7 @@ def sort_values( else: return self.copy(deep=None) - if array_equal_fast(indexer, np.arange(0, len(indexer), dtype=indexer.dtype)): + if is_range_indexer(indexer, len(indexer)): if inplace: return self._update_inplace(self) else: @@ -6922,8 +6932,8 @@ def value_counts( Parameters ---------- - subset : label or list of labels, optional - Column(s) to use when counting unique combinations. + subset : mapping, function, label, list of labels, optional + Columns to use when counting unique combinations. normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True @@ -10877,11 +10887,7 @@ def quantile( f"Invalid method: {method}. Method must be in {valid_method}." ) if method == "single": - # error: Argument "qs" to "quantile" of "BlockManager" has incompatible type - # "Index"; expected "Float64Index" - res = data._mgr.quantile( - qs=q, axis=1, interpolation=interpolation # type: ignore[arg-type] - ) + res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) elif method == "table": valid_interpolation = {"nearest", "lower", "higher"} if interpolation not in valid_interpolation: From 245526bef90462c3cba486620f5dc44c0708852c Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Sun, 29 Jan 2023 23:09:04 -0500 Subject: [PATCH 05/10] added integer column label test --- pandas/core/frame.py | 2 +- pandas/tests/frame/methods/test_value_counts.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 142fcb54d993f..f4ae2b4e3a946 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6932,7 +6932,7 @@ def value_counts( Parameters ---------- - subset : mapping, function, label, list of labels, optional + subset : label or list of labels, optional Columns to use when counting unique combinations. normalize : bool, default False Return proportions rather than frequencies. 
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 8bf54aadf340c..b21f5d4530d87 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -160,4 +160,13 @@ def test_data_frame_value_counts_subset(nulls_fixture): index=pd.Index(["John", "Anne", "Beth"], name="first_name"), ) + df = pd.DataFrame( + {100: [2, 100, 5, 9], 200: [2, 6, 2, 6], 300: [4, 6, 2, 1]}, + ) + result = df.value_counts([200]) + expected = pd.Series( + data=[2, 2], + index=pd.Index([2, 6], name=200), + ) + tm.assert_series_equal(result, expected) From e5f96b236589c6e4ef292c0eb2588d41592d7c36 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Sun, 29 Jan 2023 23:37:35 -0500 Subject: [PATCH 06/10] updated value_counts subset test --- pandas/tests/frame/methods/test_value_counts.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index b21f5d4530d87..9e32755821486 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -160,13 +160,15 @@ def test_data_frame_value_counts_subset(nulls_fixture): index=pd.Index(["John", "Anne", "Beth"], name="first_name"), ) + tm.assert_series_equal(result, expected) + df = pd.DataFrame( {100: [2, 100, 5, 9], 200: [2, 6, 2, 6], 300: [4, 6, 2, 1]}, ) result = df.value_counts([200]) expected = pd.Series( data=[2, 2], - index=pd.Index([2, 6], name=200), + index=pd.MultiIndex.from_arrays([[2, 6]], names=[200]), ) tm.assert_series_equal(result, expected) From 80d0576146fa821ae30b5840926232a566d10f12 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Tue, 31 Jan 2023 21:14:29 -0500 Subject: [PATCH 07/10] parameterized tests and fixed implimentation for integers --- pandas/core/frame.py | 2 +- 
.../tests/frame/methods/test_value_counts.py | 23 ++++++------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4ae2b4e3a946..0f44fc1e634d0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7043,7 +7043,7 @@ def value_counts( counts /= counts.sum() # Force MultiIndex for single column - if len(subset) == 1: + if is_list_like(subset) and len(subset) == 1: counts.index = MultiIndex.from_arrays( [counts.index], names=[counts.index.name] ) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 9e32755821486..1b861bee9a667 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd import pandas._testing as tm @@ -146,29 +147,19 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): tm.assert_series_equal(result, expected) -def test_data_frame_value_counts_subset(nulls_fixture): +@pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1])) +def test_data_frame_value_counts_subset(nulls_fixture, columns): # GH 50829 df = pd.DataFrame( { - "first_name": ["John", "Anne", "John", "Beth"], - "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + columns[0]: ["John", "Anne", "John", "Beth"], + columns[1]: ["Smith", nulls_fixture, nulls_fixture, "Louise"], }, ) - result = df.value_counts("first_name") + result = df.value_counts(columns[0]) expected = pd.Series( data=[2, 1, 1], - index=pd.Index(["John", "Anne", "Beth"], name="first_name"), - ) - - tm.assert_series_equal(result, expected) - - df = pd.DataFrame( - {100: [2, 100, 5, 9], 200: [2, 6, 2, 6], 300: [4, 6, 2, 1]}, - ) - result = df.value_counts([200]) - expected = pd.Series( - data=[2, 2], - index=pd.MultiIndex.from_arrays([[2, 6]], names=[200]), + index=pd.Index(["John", "Anne", "Beth"], name=columns[0]), ) 
tm.assert_series_equal(result, expected) From ff87dc8abca8d8e1cdb21aea08e8e487b097c7e5 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Tue, 7 Feb 2023 23:02:57 -0500 Subject: [PATCH 08/10] fixed docstring --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6ace24782fb4a..5acd6fde5dfef 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7049,7 +7049,6 @@ def value_counts( John 2 Anne 1 Beth 1 - dtype: int64 Name: count, dtype: int64 """ if subset is None: From a7f61c53b6367f050ca48740892ec64f2dac9354 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Wed, 8 Feb 2023 00:34:24 -0500 Subject: [PATCH 09/10] fixed test_data_frame_value_counts_subset --- pandas/tests/frame/methods/test_value_counts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index fa38cd15bfa2f..355f05cd5156c 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -171,6 +171,7 @@ def test_data_frame_value_counts_subset(nulls_fixture, columns): expected = pd.Series( data=[2, 1, 1], index=pd.Index(["John", "Anne", "Beth"], name=columns[0]), + name="count", ) tm.assert_series_equal(result, expected) From c3adbc9624c2fc092d1abc0ebbcf4bc8a914fe56 Mon Sep 17 00:00:00 2001 From: Taylor Packard <3.t.packard@gmail.com> Date: Sat, 18 Feb 2023 14:10:29 -0500 Subject: [PATCH 10/10] updated whatsnew and docs --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/frame.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ac7d30310be9e..f63563472f1f2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1088,6 +1088,7 @@ Removal of prior version deprecations/changes - Arguments after ``expr`` in 
:meth:`DataFrame.eval` and :meth:`DataFrame.query` are keyword-only (:issue:`47587`) - Removed :meth:`Index._get_attributes_dict` (:issue:`50648`) - Removed :meth:`Series.__array_wrap__` (:issue:`50648`) +- Changed behavior of :meth:`.DataFrame.value_counts` to return a :class:`Series` with :class:`MultiIndex` for any list-like (one element or not) but an :class:`Index` for a single label (:issue:`50829`) .. --------------------------------------------------------------------------- .. _whatsnew_200.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 629259689e5fc..2520f54ae3d99 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6981,9 +6981,10 @@ def value_counts( Notes ----- The returned Series will have a MultiIndex with one level per input - column. By default, rows that contain any NA values are omitted from - the result. By default, the resulting Series will be in descending - order so that the first element is the most frequently-occurring row. + column but an Index (non-multi) for a single label. By default, rows + that contain any NA values are omitted from the result. By default, + the resulting Series will be in descending order so that the first + element is the most frequently-occurring row. Examples --------