From df0cae94416aef4f7370e68c0e504d96b57b8b17 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:30:50 -0700 Subject: [PATCH 1/4] Clean up logic in union_indexes --- pandas/_libs/lib.pyx | 12 ++--- pandas/core/indexes/api.py | 90 ++++++++++++-------------------------- 2 files changed, 33 insertions(+), 69 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a2205454a5a46..c75a1ffc90da1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -320,8 +320,8 @@ def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list: Py_ssize_t k = len(lists) Py_ssize_t i, j, n list uniques = [] - dict table = {} - object val, stub = 0 + set table = set() + object val for i in range(k): buf = lists[i] @@ -329,7 +329,7 @@ def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list: for j in range(n): val = buf[j] if val not in table: - table[val] = stub + table.add(val) uniques.append(val) if sort: try: @@ -361,15 +361,15 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: list buf Py_ssize_t j, n list uniques = [] - dict table = {} - object val, stub = 0 + set table = set() + object val for buf in gen: n = len(buf) for j in range(n): val = buf[j] if val not in table: - table[val] = stub + table.add(val) uniques.append(val) if sort: try: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 9b05eb42c6d6e..e65eb5b3b4240 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -209,60 +209,6 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds, dtype) -> Index: - """ - Concatenate indices and remove duplicates. - - Parameters - ---------- - inds : list of Index or list objects - dtype : dtype to set for the resulting Index - - Returns - ------- - Index - """ - if all(isinstance(ind, Index) for ind in inds): - inds = [ind.astype(dtype, copy=False) for ind in inds] - result = inds[0].unique() - other = inds[1].append(inds[2:]) - diff = other[result.get_indexer_for(other) == -1] - if len(diff): - result = result.append(diff.unique()) - if sort: - result = result.sort_values() - return result - - def conv(i): - if isinstance(i, Index): - i = i.tolist() - return i - - return Index( - lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort), - dtype=dtype, - ) - - def _find_common_index_dtype(inds): - """ - Finds a common type for the indexes to pass through to resulting index. - - Parameters - ---------- - inds: list of Index or list objects - - Returns - ------- - The common type or None if no indexes were given - """ - dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] - if dtypes: - dtype = find_common_type(dtypes) - else: - dtype = None - - return dtype - if kind == "special": result = indexes[0] @@ -294,18 +240,36 @@ def _find_common_index_dtype(inds): return result elif kind == "array": - dtype = _find_common_index_dtype(indexes) - index = indexes[0] - if not all(index.equals(other) for other in indexes[1:]): - index = _unique_indices(indexes, dtype) + if not all_indexes_same(indexes): + dtype = find_common_type([idx.dtype for idx in indexes]) + inds = [ind.astype(dtype, copy=False) for ind in indexes] + index = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[index.get_indexer_for(other) == -1] + if len(diff): + index = index.append(diff.unique()) + if sort: + index = index.sort_values() + else: + index = indexes[0] name = get_unanimous_names(*indexes)[0] if name != index.name: index = index.rename(name) return index - else: # kind='list' - dtype = _find_common_index_dtype(indexes) - return _unique_indices(indexes, dtype) + elif kind == "list": + dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] + if dtypes: + dtype = find_common_type(dtypes) + else: + dtype = None + all_lists = [idx.tolist() if isinstance(idx, Index) else idx for idx in indexes] + return Index( + lib.fast_unique_multiple_list(all_lists, sort=sort), + dtype=dtype, + ) + else: + raise ValueError(f"{kind=} must be 'special', 'array' or 'list'.") def _sanitize_and_check(indexes): @@ -329,14 +293,14 @@ def _sanitize_and_check(indexes): sanitized_indexes : list of Index or list objects type : {'list', 'array', 'special'} """ - kinds = list({type(index) for index in indexes}) + kinds = {type(index) for index in indexes} if list in kinds: if len(kinds) > 1: indexes = [ Index(list(x)) if not isinstance(x, Index) else x for x in indexes ] - kinds.remove(list) + kinds -= {list} else: return indexes, "list" From 480cffde09c796fe9ec9723cc5307d207c754605 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:34:37 -0700 Subject: [PATCH 2/4] add typing --- pandas/core/indexes/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index e65eb5b3b4240..8b17e33909d06 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, + Literal, cast, ) @@ -272,7 +273,9 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: raise ValueError(f"{kind=} must be 'special', 'array' or 'list'.") -def _sanitize_and_check(indexes): +def _sanitize_and_check( + indexes, +) -> tuple[list[Index | list], Literal["list", "array", "special"]]: """ Verify the type of indexes and convert lists to Index. From 6d6fe04619db2b91f06c96d34a82133143897ff0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:29:27 -0700 Subject: [PATCH 3/4] Just use the generator version --- pandas/_libs/lib.pyi | 1 - pandas/_libs/lib.pyx | 28 ---------------------------- pandas/core/indexes/api.py | 4 ++-- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b39d32d069619..daaaacee3487d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -67,7 +67,6 @@ def fast_multiget( default=..., ) -> ArrayLike: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... -def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... @overload def map_infer( arr: np.ndarray, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c75a1ffc90da1..7aa1cb715521e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -312,34 +312,6 @@ def item_from_zerodim(val: object) -> object: return val -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list: - cdef: - list buf - Py_ssize_t k = len(lists) - Py_ssize_t i, j, n - list uniques = [] - set table = set() - object val - - for i in range(k): - buf = lists[i] - n = len(buf) - for j in range(n): - val = buf[j] - if val not in table: - table.add(val) - uniques.append(val) - if sort: - try: - uniques.sort() - except TypeError: - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 8b17e33909d06..8fb7efbe90066 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -264,9 +264,9 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: dtype = find_common_type(dtypes) else: dtype = None - all_lists = [idx.tolist() if isinstance(idx, Index) else idx for idx in indexes] + all_lists = (idx.tolist() if isinstance(idx, Index) else idx for idx in indexes) return Index( - lib.fast_unique_multiple_list(all_lists, sort=sort), + lib.fast_unique_multiple_list_gen(all_lists, sort=bool(sort)), dtype=dtype, ) else: From 938e1c15edcb0bb70b88082c52ace5e6d37925be Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:31:35 -0700 Subject: [PATCH 4/4] Undo typing --- pandas/core/indexes/api.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 8fb7efbe90066..c5e3f3a50e10d 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -2,7 +2,6 @@ from typing import ( TYPE_CHECKING, - Literal, cast, ) @@ -273,9 +272,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: raise ValueError(f"{kind=} must be 'special', 'array' or 'list'.") -def _sanitize_and_check( - indexes, -) -> tuple[list[Index | list], Literal["list", "array", "special"]]: +def _sanitize_and_check(indexes): """ Verify the type of indexes and convert lists to Index.