From 33acd3a372b430490709ae6d1babdbff9171a50f Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Apr 2021 02:43:26 +0100 Subject: [PATCH 1/5] TYP/CLN: factorize_from_iterable(s) --- pandas/core/arrays/categorical.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 32c3095c3e6ee..fe08ea418493e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -11,6 +11,7 @@ List, Optional, Sequence, + Tuple, Type, TypeVar, Union, @@ -2642,13 +2643,11 @@ def recode_for_categories( return new_codes -def factorize_from_iterable(values): +def factorize_from_iterable(values) -> Tuple[np.ndarray, Index]: """ Factorize an input `values` into `categories` and `codes`. Preserves categorical dtype in `categories`. - *This is an internal function* - Parameters ---------- values : list-like @@ -2660,6 +2659,8 @@ def factorize_from_iterable(values): If `values` has a categorical dtype, then `categories` is a CategoricalIndex keeping the categories and order of `values`. """ + from pandas import CategoricalIndex + if not is_list_like(values): raise TypeError("Input must be list-like") @@ -2668,7 +2669,8 @@ def factorize_from_iterable(values): # The Categorical we want to build has the same categories # as values but its codes are by def [0, ..., len(n_categories) - 1] cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) - categories = Categorical.from_codes(cat_codes, dtype=values.dtype) + cat = Categorical.from_codes(cat_codes, dtype=values.dtype) + categories = CategoricalIndex(cat) codes = values.codes else: # The value of ordered is irrelevant since we don't use cat as such, @@ -2680,26 +2682,26 @@ def factorize_from_iterable(values): return codes, categories -def factorize_from_iterables(iterables): +def factorize_from_iterables(iterables) -> Tuple[List[np.ndarray], List[Index]]: """ A higher-level wrapper over `factorize_from_iterable`. - *This is an internal function* - Parameters ---------- iterables : list-like of list-likes Returns ------- - codes_list : list of ndarrays - categories_list : list of Indexes + codes : list of ndarrays + categories : list of Indexes Notes ----- See `factorize_from_iterable` for more info. """ if len(iterables) == 0: - # For consistency, it should return a list of 2 lists. - return [[], []] - return map(list, zip(*(factorize_from_iterable(it) for it in iterables))) + # For consistency, it should return two empty lists. + return [], [] + + codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) + return list(codes), list(categories) From 193a1d85315162e14bbb750ad1d3bae389c3277a Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Apr 2021 02:56:37 +0100 Subject: [PATCH 2/5] special-casing len(iterables) == 0 not needed --- pandas/core/arrays/categorical.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index fe08ea418493e..68331fbf08353 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2699,9 +2699,5 @@ def factorize_from_iterables(iterables) -> Tuple[List[np.ndarray], List[Index]]: ----- See `factorize_from_iterable` for more info. """ - if len(iterables) == 0: - # For consistency, it should return two empty lists. - return [], [] - codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) return list(codes), list(categories) From 55e36143c6f42afab1f31848cdaf54b8e7a5560d Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Apr 2021 02:59:57 +0100 Subject: [PATCH 3/5] reverse last commit --- pandas/core/arrays/categorical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 68331fbf08353..fe08ea418493e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2699,5 +2699,9 @@ def factorize_from_iterables(iterables) -> Tuple[List[np.ndarray], List[Index]]: ----- See `factorize_from_iterable` for more info. """ + if len(iterables) == 0: + # For consistency, it should return two empty lists. + return [], [] + codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) return list(codes), list(categories) From 81f2fbf1fc8a61ee8cbfd60cfc39f9868e10cd1e Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Apr 2021 08:52:33 +0100 Subject: [PATCH 4/5] fix mypy complaints --- pandas/core/reshape/reshape.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 613669b8cc1d8..77a097e5a610e 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -991,7 +991,8 @@ def get_empty_frame(data) -> DataFrame: if prefix is None: dummy_cols = levels else: - dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] + template = f"{prefix}{prefix_sep}" + "{label}" + dummy_cols = levels.map(template.format) index: Optional[Index] if isinstance(data, Series): From e11436a51fbc3099da7ef8d9086499e171f1373b Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Apr 2021 09:35:29 +0100 Subject: [PATCH 5/5] fix mypy complaints, attempt II --- pandas/core/reshape/reshape.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 77a097e5a610e..346dc3732b212 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -991,8 +991,7 @@ def get_empty_frame(data) -> DataFrame: if prefix is None: dummy_cols = levels else: - template = f"{prefix}{prefix_sep}" + "{label}" - dummy_cols = levels.map(template.format) + dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels]) index: Optional[Index] if isinstance(data, Series):