From 18de37686fb70d80d1f66486e25ffb44918862d3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Feb 2018 16:26:01 -0600 Subject: [PATCH 1/8] REF/BUG/API: factorizing categorical data This changes / fixes how Categorical data are factorized. The return value of a factorized categorical is now `Tuple[ndarray[int], Categorical]`. Before ```python In [2]: l, u = pd.factorize(pd.Categorical(['a', 'a', 'b'])) In [3]: l Out[3]: array([0, 0, 1]) In [4]: u Out[4]: array([0, 1]) ``` after ```python In [2]: l, u = pd.factorize(pd.Categorical(['a', 'a', 'b'])) In [3]: l Out[3]: array([0, 0, 1]) In [4]: u Out[4]: [a, b] Categories (2, object): [a, b] ``` The implementation is similar to `.unique`. 1. The algo (`pd.factorize`, `pd.unique`) handles unboxing / dtype coercion 2. The algo dispatches the actual array factorization for extension types 3. The algo boxes the output if necessary, depending on the input. I've implemented this as a new public method on ``Categorical``, mainly since this is what we do for unique, and I think it's a useful method to have. This fixes a bug in factorizing categoricals with missing values. Previously, we included -1 in the uniques. 
Before ```python In [2]: l, u = pd.factorize(pd.Categorical(['a', 'a', 'b', None])) In [3]: u Out[3]: array([ 0, 1, -1]) ``` After ```python In [2]: l, u = pd.factorize(pd.Categorical(['a', 'a', 'b', None])) In [3]: u Out[3]: [a, b] Categories (2, object): [a, b] ``` --- doc/source/whatsnew/v0.23.0.txt | 2 + pandas/core/algorithms.py | 74 ++++++++++++++++++++------ pandas/core/arrays/categorical.py | 59 ++++++++++++++++++++ pandas/tests/categorical/test_algos.py | 49 +++++++++++++++++ 4 files changed, 167 insertions(+), 17 deletions(-) create mode 100644 pandas/tests/categorical/test_algos.py diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 542e62aa374be..01f68940c1715 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -746,6 +746,8 @@ Categorical - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) - Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) - Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19565`) +- Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`) +- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d616e3f92aa4d..3a34880afbd19 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -438,6 +438,35 @@ def isin(comps, values): return f(comps, values) +def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): + """Factorize an array-like to labels and uniques. 
+ + This doesn't do any coercion of types or unboxing before factorization. + + Parameters + ---------- + values : ndarray + check_nulls : bool + Whether to check for nulls in the hashtable's 'get_labels' method. + na_sentinel : int, default -1 + size_hint : int, optional + Passed through to the hashtable's 'get_labels' method + + Returns + ------- + labels, uniques : ndarray + """ + (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + + table = hash_klass(size_hint or len(values)) + uniques = vec_klass() + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + + labels = _ensure_platform_int(labels) + uniques = uniques.to_array() + return labels, uniques + + @deprecate_kwarg(old_arg_name='order', new_arg_name=None) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ @@ -445,8 +474,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): Parameters ---------- - values : ndarray (1-d) - Sequence + values : Sequence + ndarrays must be 1-D. Sequences that aren't pandas objects are + coerced to ndarrays before factorization. sort : boolean, default False Sort by values na_sentinel : int, default -1 @@ -461,26 +491,36 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): Series note: an array of Periods will ignore sort as it returns an always sorted - PeriodIndex + PeriodIndex. """ + # Implementation notes: This method is responsible for 3 things + # 1.) coercing data to array-like (ndarray, Index, extension array) + # 2.) factorizing labels and uniques + # 3.) Maybe boxing the output in an Index + # + # Step 2 is dispatched to extension types (like Categorical). They are + # responsible only for factorization and sorting if necessary. All + # data coercion and boxing should happen here. 
values = _ensure_arraylike(values) original = values - values, dtype, _ = _ensure_data(values) - (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - - table = hash_klass(size_hint or len(values)) - uniques = vec_klass() - check_nulls = not is_integer_dtype(original) - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) - - labels = _ensure_platform_int(labels) - uniques = uniques.to_array() - if sort and len(uniques) > 0: - from pandas.core.sorting import safe_sort - uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, - assume_unique=True) + if is_categorical_dtype(values): + values = getattr(values, '_values', values) + labels, uniques = values.factorize(sort=sort) + dtype = original.dtype + else: + values, dtype, _ = _ensure_data(values) + check_nulls = not is_integer_dtype(original) + labels, uniques = _factorize_array(values, check_nulls, + na_sentinel=na_sentinel, + size_hint=size_hint) + + if sort and len(uniques) > 0: + from pandas.core.sorting import safe_sort + uniques, labels = safe_sort(uniques, labels, + na_sentinel=na_sentinel, + assume_unique=True) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6eeabf0148d0..5f9661c17b0d3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -7,6 +7,7 @@ from pandas import compat from pandas.compat import u, lzip from pandas._libs import lib, algos as libalgos +from pandas._libs.tslib import iNaT from pandas.core.dtypes.generic import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex) @@ -2068,6 +2069,64 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) + def factorize(self, sort=False, na_sentinel=-1): + """Encode the Categorical as an enumerated type. 
+ + Parameters + ---------- + sort : boolean, default False + Sort by values + na_sentinel: int, default -1 + Value to mark "not found" + + Returns + ------- + labels : ndarray + An integer NumPy array that's an indexer into the original + Categorical + uniques : Categorical + A Categorical whose values are the unique values and + whose dtype matches the original CategoricalDtype. Note that + any unobserved categories in ``self`` will not be present + in ``uniques.values``. They will be present in + ``uniques.categories`` + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) + >>> labels, uniques = cat.factorize() + >>> labels + array([0, 0, 1]) + >>> uniques + [a, c] + Categories (3, object): [a, b, c] + + Missing values are handled + + >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None])) + >>> labels + array([ 0, 1, -1]) + >>> uniques + [a, b] + Categories (2, object): [a, b] + """ + from pandas.core.algorithms import _factorize_array, take_1d + + codes = self.codes.astype('int64') + # We set missing codes, normally -1, to iNaT so that the + # Int64HashTable treats them as missing values. + codes[codes == -1] = iNaT + labels, uniques = _factorize_array(codes, check_nulls=True, + na_sentinel=na_sentinel) + uniques = self._constructor(self.categories.take(uniques), + categories=self.categories, + ordered=self.ordered) + if sort: + order = uniques.argsort() + labels = take_1d(order, labels, fill_value=na_sentinel) + uniques = uniques.take(order) + return labels, uniques + def equals(self, other): """ Returns True if categorical arrays are equal. 
diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py new file mode 100644 index 0000000000000..6d21f548c65b2 --- /dev/null +++ b/pandas/tests/categorical/test_algos.py @@ -0,0 +1,49 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize('categories', [ + ['b', 'a', 'c'], + ['a', 'b', 'c', 'd'], +]) +def test_factorize(categories, ordered): + cat = pd.Categorical(['b', 'b', 'a', 'c', None], + categories=categories, + ordered=ordered) + labels, uniques = pd.factorize(cat) + expected_labels = np.array([0, 0, 1, 2, -1]) + expected_uniques = pd.Categorical(['b', 'a', 'c'], + categories=categories, + ordered=ordered) + + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort(): + cat = pd.Categorical(['b', 'b', None, 'a']) + labels, uniques = pd.factorize(cat, sort=True) + expected_labels = np.array([1, 1, -1, 0]) + expected_uniques = pd.Categorical(['a', 'b']) + + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort_ordered(): + cat = pd.Categorical(['b', 'b', None, 'a'], + categories=['c', 'b', 'a'], + ordered=True) + + labels, uniques = pd.factorize(cat, sort=True) + expected_labels = np.array([0, 0, -1, 1]) + expected_uniques = pd.Categorical(['b', 'a'], + categories=['c', 'b', 'a'], + ordered=True) + + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_categorical_equal(uniques, expected_uniques) From 9ef5be218ca60aea53e24b2c2ae322413e9b401e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 28 Feb 2018 13:57:36 -0600 Subject: [PATCH 2/8] Explicit dtype for expected --- pandas/tests/categorical/test_algos.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/categorical/test_algos.py 
b/pandas/tests/categorical/test_algos.py index 6d21f548c65b2..61764ec0ff632 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -15,7 +15,7 @@ def test_factorize(categories, ordered): categories=categories, ordered=ordered) labels, uniques = pd.factorize(cat) - expected_labels = np.array([0, 0, 1, 2, -1]) + expected_labels = np.array([0, 0, 1, 2, -1], dtype='int64') expected_uniques = pd.Categorical(['b', 'a', 'c'], categories=categories, ordered=ordered) @@ -27,7 +27,7 @@ def test_factorize(categories, ordered): def test_factorized_sort(): cat = pd.Categorical(['b', 'b', None, 'a']) labels, uniques = pd.factorize(cat, sort=True) - expected_labels = np.array([1, 1, -1, 0]) + expected_labels = np.array([1, 1, -1, 0], dtype='int64') expected_uniques = pd.Categorical(['a', 'b']) tm.assert_numpy_array_equal(labels, expected_labels) @@ -40,7 +40,7 @@ def test_factorized_sort_ordered(): ordered=True) labels, uniques = pd.factorize(cat, sort=True) - expected_labels = np.array([0, 0, -1, 1]) + expected_labels = np.array([0, 0, -1, 1], dtype='int64') expected_uniques = pd.Categorical(['b', 'a'], categories=['c', 'b', 'a'], ordered=True) From 5e52b6f60101bef3eb92bd1197d103fc71b4ba21 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Mar 2018 16:04:03 -0800 Subject: [PATCH 3/8] Clean : imports / remove sort --- pandas/core/arrays/categorical.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5f9661c17b0d3..b0d5344f06a3f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -30,7 +30,8 @@ is_scalar, is_dict_like) -from pandas.core.algorithms import factorize, take_1d, unique1d +from pandas.core.algorithms import ( + factorize, take_1d, unique1d, _factorize_array) from pandas.core.accessor import PandasDelegate from pandas.core.base import (PandasObject, NoNewAttributesMixin, 
_shared_docs) @@ -2069,13 +2070,11 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) - def factorize(self, sort=False, na_sentinel=-1): + def factorize(self, na_sentinel=-1): """Encode the Categorical as an enumerated type. Parameters ---------- - sort : boolean, default False - Sort by values na_sentinel: int, default -1 Value to mark "not found" @@ -2110,21 +2109,16 @@ def factorize(self, sort=False, na_sentinel=-1): [a, b] Categories (2, object): [a, b] """ - from pandas.core.algorithms import _factorize_array, take_1d codes = self.codes.astype('int64') + codes[codes == -1] = iNaT # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. - codes[codes == -1] = iNaT labels, uniques = _factorize_array(codes, check_nulls=True, na_sentinel=na_sentinel) uniques = self._constructor(self.categories.take(uniques), categories=self.categories, ordered=self.ordered) - if sort: - order = uniques.argsort() - labels = take_1d(order, labels, fill_value=na_sentinel) - uniques = uniques.take(order) return labels, uniques def equals(self, other): From 121b682ab4c7b912df2a814305560b23467154f4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Mar 2018 16:12:07 -0800 Subject: [PATCH 4/8] Restore sort --- pandas/core/arrays/categorical.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d1ae84184fb89..800a12d2f5ca1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -30,8 +30,7 @@ is_scalar, is_dict_like) -from pandas.core.algorithms import ( - factorize, take_1d, unique1d, _factorize_array) +from pandas.core.algorithms import factorize, take_1d, unique1d from pandas.core.accessor import PandasDelegate from pandas.core.base import (PandasObject, NoNewAttributesMixin, _shared_docs) @@ -366,10 +365,6 @@ def __init__(self, 
values, categories=None, ordered=None, dtype=None, self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) - @classmethod - def _constructor_from_sequence(cls, scalars): - return cls(scalars) - @property def categories(self): """The categories of this categorical. @@ -2074,11 +2069,13 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) - def factorize(self, na_sentinel=-1): + def factorize(self, sort=False, na_sentinel=-1): """Encode the Categorical as an enumerated type. Parameters ---------- + sort : boolean, default False + Sort by values na_sentinel: int, default -1 Value to mark "not found" @@ -2113,6 +2110,7 @@ def factorize(self, na_sentinel=-1): [a, b] Categories (2, object): [a, b] """ + from pandas.core.algorithms import _factorize_array, take_1d codes = self.codes.astype('int64') codes[codes == -1] = iNaT @@ -2123,6 +2121,10 @@ def factorize(self, na_sentinel=-1): uniques = self._constructor(self.categories.take(uniques), categories=self.categories, ordered=self.ordered) + if sort: + order = uniques.argsort() + labels = take_1d(order, labels, fill_value=na_sentinel) + uniques = uniques.take(order) return labels, uniques def equals(self, other): From a6bc40594ac706198fcc44057bd99e7869a76f38 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Mar 2018 14:53:48 -0600 Subject: [PATCH 5/8] REF: remove sort from Categorical.factorize --- pandas/core/algorithms.py | 12 +++++++++--- pandas/core/arrays/categorical.py | 8 ++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3a34880afbd19..9b41b0feeef4a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -507,7 +507,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if is_categorical_dtype(values): values = getattr(values, '_values', values) - labels, uniques = 
values.factorize(sort=sort) + labels, uniques = values.factorize() dtype = original.dtype else: values, dtype, _ = _ensure_data(values) @@ -516,8 +516,14 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): na_sentinel=na_sentinel, size_hint=size_hint) - if sort and len(uniques) > 0: - from pandas.core.sorting import safe_sort + if sort and len(uniques) > 0: + from pandas.core.sorting import safe_sort + try: + order = uniques.argsort() + labels = take_1d(order, labels, fill_value=na_sentinel) + uniques = uniques.take(order) + except TypeError: + # Mixed types, where uniques.argsort fails. uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 800a12d2f5ca1..c4da558467e41 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2069,7 +2069,7 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) - def factorize(self, sort=False, na_sentinel=-1): + def factorize(self, na_sentinel=-1): """Encode the Categorical as an enumerated type. 
Parameters @@ -2110,7 +2110,7 @@ def factorize(self, sort=False, na_sentinel=-1): [a, b] Categories (2, object): [a, b] """ - from pandas.core.algorithms import _factorize_array, take_1d + from pandas.core.algorithms import _factorize_array codes = self.codes.astype('int64') codes[codes == -1] = iNaT @@ -2121,10 +2121,6 @@ def factorize(self, sort=False, na_sentinel=-1): uniques = self._constructor(self.categories.take(uniques), categories=self.categories, ordered=self.ordered) - if sort: - order = uniques.argsort() - labels = take_1d(order, labels, fill_value=na_sentinel) - uniques = uniques.take(order) return labels, uniques def equals(self, other): From 0bfbc478a4fe2ccb36f3819bfccec8389d8c05cf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Mar 2018 14:57:15 -0600 Subject: [PATCH 6/8] Updated comment --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9b41b0feeef4a..7b6ecd2df93d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -499,8 +499,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # 3.) Maybe boxing the output in an Index # # Step 2 is dispatched to extension types (like Categorical). They are - # responsible only for factorization and sorting if necessary. All - # data coercion and boxing should happen here. + # responsible only for factorization. All data coercion, sorting and boxing + # should happen here. 
values = _ensure_arraylike(values) original = values From 2688c4f39d4ffdb40ddc1ecb6471fb11627cac7e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Mar 2018 07:56:08 -0500 Subject: [PATCH 7/8] Fixed new sort algo --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7b6ecd2df93d8..884c564763a10 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -520,7 +520,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): from pandas.core.sorting import safe_sort try: order = uniques.argsort() - labels = take_1d(order, labels, fill_value=na_sentinel) + order2 = order.argsort() + labels = take_1d(order2, labels, fill_value=na_sentinel) uniques = uniques.take(order) except TypeError: # Mixed types, where uniques.argsort fails. From ab4f01c0eec37baf9c2bb47014ee192484bd7200 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Mar 2018 20:37:52 -0500 Subject: [PATCH 8/8] Implement interface --- pandas/core/arrays/categorical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c4da558467e41..b37f88d8bfdce 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -422,6 +422,10 @@ def _ndarray_values(self): def _constructor(self): return Categorical + @classmethod + def _constructor_from_sequence(cls, scalars): + return Categorical(scalars) + def copy(self): """ Copy constructor. """ return self._constructor(values=self._codes.copy(),