From e4be568f40b9eee7c13f9094d6f0b292c730ce09 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Fri, 26 Jul 2024 09:23:47 -0400 Subject: [PATCH 1/7] Add doc for counting categorical dtype --- doc/source/user_guide/categorical.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 1e7d66dfeb142..0c102e30bb2c2 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -240,6 +240,8 @@ expects a ``dtype``. For example :func:`pandas.read_csv`, array. In other words, ``dtype='category'`` is equivalent to ``dtype=CategoricalDtype()``. +.. _categorical.equalitysemantics: + Equality semantics ~~~~~~~~~~~~~~~~~~ @@ -1178,3 +1180,17 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica This also happens in some cases when you supply a NumPy array instead of a ``Categorical``: using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behavior, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. + +Counting CategoricalDtype +~~~~~~~~~~~~~~~~~~~~~~~~~ + +As mentioned in :ref:`Equality Semantics `, two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal +whenever they have the same categories and order. Therefore, when counting data types, the multiple instances of :class:`~pandas.api.types.CategoricalDtype` will be counted as one group if they have the same categories and order. +In the example below, even though ``a``, ``c``, and ``d`` all have data types of ``category``, they will not be counted as one group since they don't have the same categories. + +.. ipython:: python + + df = pd.DataFrame({'a': [1], 'b': ['2'], 'c': [3], 'd': [3]}).astype({'a': 'category', 'c': 'category', 'd': 'category'}) + df + df.dtypes + df.dtypes.value_counts() From 1348bdc09df6480a0a6108a2bcc21848e04da6d0 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sat, 27 Jul 2024 00:30:39 -0400 Subject: [PATCH 2/7] Move example to docstring instead --- doc/source/user_guide/categorical.rst | 10 +--------- pandas/core/base.py | 28 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 0c102e30bb2c2..0e102a3bde383 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1185,12 +1185,4 @@ Counting CategoricalDtype ~~~~~~~~~~~~~~~~~~~~~~~~~ As mentioned in :ref:`Equality Semantics `, two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal -whenever they have the same categories and order. Therefore, when counting data types, the multiple instances of :class:`~pandas.api.types.CategoricalDtype` will be counted as one group if they have the same categories and order. -In the example below, even though ``a``, ``c``, and ``d`` all have data types of ``category``, they will not be counted as one group since they don't have the same categories. - -.. ipython:: python - - df = pd.DataFrame({'a': [1], 'b': ['2'], 'c': [3], 'd': [3]}).astype({'a': 'category', 'c': 'category', 'd': 'category'}) - df - df.dtypes - df.dtypes.value_counts() +whenever they have the same categories and order. Therefore, the multiple instances of :class:`~pandas.api.types.CategoricalDtype` will be counted as one group if they have the same categories and order. diff --git a/pandas/core/base.py b/pandas/core/base.py index b784dc8b03292..972f8eafe39d1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1049,6 +1049,34 @@ def value_counts( 4.0 1 NaN 1 Name: count, dtype: int64 + + **categorial_dtypes** + + Rows with categorical type will be counted as one group\ + if they have same categories and order.\ + In the example below, even though ``a``, ``c``, and ``d``\ + all have the same data types of ``category``,\ + only ``c`` and ``d`` will be counted as one group\ + since ``a`` doesn't have the same categories. + + >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]}) + .astype({'a': 'category', 'c': 'category', 'd': 'category'}) + >>> df + a b c d + 0 1 2 3 3 + + >>> df.dtypes + a category + b object + c category + d category + dtype: object + + >>> df.dtypes.value_counts() + category 2 + category 1 + object 1 + Name: count, dtype: int64 """ return algorithms.value_counts_internal( self, From d40849bff02c429c5e4715e43db6ae1675e57825 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sat, 27 Jul 2024 01:00:16 -0400 Subject: [PATCH 3/7] Remove backslash --- pandas/core/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 972f8eafe39d1..04454c0c68b20 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1052,11 +1052,11 @@ def value_counts( **categorial_dtypes** - Rows with categorical type will be counted as one group\ - if they have same categories and order.\ - In the example below, even though ``a``, ``c``, and ``d``\ - all have the same data types of ``category``,\ - only ``c`` and ``d`` will be counted as one group\ + Rows with categorical type will be counted as one group + if they have same categories and order. + In the example below, even though ``a``, ``c``, and ``d`` + all have the same data types of ``category``, + only ``c`` and ``d`` will be counted as one group since ``a`` doesn't have the same categories. >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]}) From bc2378c212f7cc16903ab2e9a57752ecafe28027 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sat, 27 Jul 2024 01:36:54 -0400 Subject: [PATCH 4/7] add line --- pandas/core/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 04454c0c68b20..af02562c2a756 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1060,7 +1060,7 @@ def value_counts( since ``a`` doesn't have the same categories. >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]}) - .astype({'a': 'category', 'c': 'category', 'd': 'category'}) + >>> df = df.astype({"a": "category", "c": "category", "d": "category"}) >>> df a b c d 0 1 2 3 3 From d8381812e305c6c60f5fde26ded4b907fe96db04 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sat, 27 Jul 2024 20:05:29 -0400 Subject: [PATCH 5/7] Undo the change in categorical.rst --- doc/source/user_guide/categorical.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 0e102a3bde383..0c102e30bb2c2 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1185,4 +1185,12 @@ Counting CategoricalDtype ~~~~~~~~~~~~~~~~~~~~~~~~~ As mentioned in :ref:`Equality Semantics `, two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal -whenever they have the same categories and order. Therefore, the multiple instances of :class:`~pandas.api.types.CategoricalDtype` will be counted as one group if they have the same categories and order. +whenever they have the same categories and order. Therefore, when counting data types, the multiple instances of :class:`~pandas.api.types.CategoricalDtype` will be counted as one group if they have the same categories and order. +In the example below, even though ``a``, ``c``, and ``d`` all have data types of ``category``, they will not be counted as one group since they don't have the same categories. + +.. ipython:: python + + df = pd.DataFrame({'a': [1], 'b': ['2'], 'c': [3], 'd': [3]}).astype({'a': 'category', 'c': 'category', 'd': 'category'}) + df + df.dtypes + df.dtypes.value_counts() From 86facac3c582c399e600a7a0f0f6cb7d26495760 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sat, 27 Jul 2024 20:05:59 -0400 Subject: [PATCH 6/7] Rename it to Categorical Dtypes --- pandas/core/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index af02562c2a756..863cf978426e2 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1050,7 +1050,7 @@ def value_counts( NaN 1 Name: count, dtype: int64 - **categorial_dtypes** + **Categorical Dtypes** Rows with categorical type will be counted as one group if they have same categories and order. From a847c134641a6e5dfa1c95e2e30bd6e67885b8cf Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Mon, 29 Jul 2024 22:14:16 -0400 Subject: [PATCH 7/7] undo categorical.rst --- doc/source/user_guide/categorical.rst | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 0c102e30bb2c2..1e7d66dfeb142 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -240,8 +240,6 @@ expects a ``dtype``. For example :func:`pandas.read_csv`, array. In other words, ``dtype='category'`` is equivalent to ``dtype=CategoricalDtype()``. -.. _categorical.equalitysemantics: - Equality semantics ~~~~~~~~~~~~~~~~~~ @@ -1180,17 +1178,3 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica This also happens in some cases when you supply a NumPy array instead of a ``Categorical``: using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behavior, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. - -Counting CategoricalDtype -~~~~~~~~~~~~~~~~~~~~~~~~~ - -As mentioned in :ref:`Equality Semantics `, two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal -whenever they have the same categories and order. Therefore, when counting data types, the multiple instances of :class:`~pandas.api.types.CategoricalDtype` will be counted as one group if they have the same categories and order. -In the example below, even though ``a``, ``c``, and ``d`` all have data types of ``category``, they will not be counted as one group since they don't have the same categories. - -.. ipython:: python - - df = pd.DataFrame({'a': [1], 'b': ['2'], 'c': [3], 'd': [3]}).astype({'a': 'category', 'c': 'category', 'd': 'category'}) - df - df.dtypes - df.dtypes.value_counts()