From af72f4c15cc9beed3d93e38df29aec31960be453 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 15:59:22 -0500 Subject: [PATCH 01/24] Improve merge performance --- asv_bench/benchmarks/join_merge.py | 114 ++++----------------- pandas/_libs/hashtable_class_helper.pxi.in | 32 ++++++ pandas/core/indexes/base.py | 22 ++++ pandas/core/reshape/merge.py | 2 + 4 files changed, 75 insertions(+), 95 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index ce64304731116..b5a1007b566c9 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -20,101 +20,6 @@ from pandas import ordered_merge as merge_ordered -class Concat: - params = [0, 1] - param_names = ["axis"] - - def setup(self, axis): - N = 1000 - s = Series(N, index=Index([f"i-{i}" for i in range(N)], dtype=object)) - self.series = [s[i:-i] for i in range(1, 10)] * 50 - self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 - df = DataFrame( - {"A": range(N)}, index=date_range("20130101", periods=N, freq="s") - ) - self.empty_left = [DataFrame(), df] - self.empty_right = [df, DataFrame()] - self.mixed_ndims = [df, df.head(N // 2)] - - def time_concat_series(self, axis): - concat(self.series, axis=axis, sort=False) - - def time_concat_small_frames(self, axis): - concat(self.small_frames, axis=axis) - - def time_concat_empty_right(self, axis): - concat(self.empty_right, axis=axis) - - def time_concat_empty_left(self, axis): - concat(self.empty_left, axis=axis) - - def time_concat_mixed_ndims(self, axis): - concat(self.mixed_ndims, axis=axis) - - -class ConcatDataFrames: - params = ([0, 1], [True, False]) - param_names = ["axis", "ignore_index"] - - def setup(self, axis, ignore_index): - frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C")) - self.frame_c = [frame_c] * 20 - frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F")) - self.frame_f = [frame_f] * 20 - - def time_c_ordered(self, axis, ignore_index): - concat(self.frame_c, axis=axis, ignore_index=ignore_index) - - def time_f_ordered(self, axis, ignore_index): - concat(self.frame_f, axis=axis, ignore_index=ignore_index) - - -class ConcatIndexDtype: - params = ( - [ - "datetime64[ns]", - "int64", - "Int64", - "int64[pyarrow]", - "string[python]", - "string[pyarrow]", - ], - ["monotonic", "non_monotonic", "has_na"], - [0, 1], - [True, False], - ) - param_names = ["dtype", "structure", "axis", "sort"] - - def setup(self, dtype, structure, axis, sort): - N = 10_000 - if dtype == "datetime64[ns]": - vals = date_range("1970-01-01", periods=N) - elif dtype in ("int64", "Int64", "int64[pyarrow]"): - vals = np.arange(N, dtype=np.int64) - elif dtype in ("string[python]", "string[pyarrow]"): - vals = Index([f"i-{i}" for i in range(N)], dtype=object) - else: - raise NotImplementedError - - idx = Index(vals, dtype=dtype) - - if structure == "monotonic": - idx = idx.sort_values() - elif structure == "non_monotonic": - idx = idx[::-1] - elif structure == "has_na": - if not idx._can_hold_na: - raise NotImplementedError - idx = Index([None], dtype=dtype).append(idx) - else: - raise NotImplementedError - - self.series = [Series(i, idx[:-i]) for i in range(1, 6)] - - def time_concat_series(self, dtype, structure, axis, sort): - concat(self.series, axis=axis, sort=sort) - - class Join: params = [True, False] param_names = ["sort"] @@ -328,6 +233,25 @@ def time_i8merge(self, how): merge(self.left, self.right, how=how) +class UniqueMerge: + params = [["left", "right"], [4_000_000, 1_000_000]] + param_names = ["unique", "unique_elements"] + + def setup(self, unique, unique_elements): + N = 1_000_000 + self.left = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) + self.right = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) + uniques = self.right.a.drop_duplicates() + self.right["a"] = concat( + [uniques, Series(np.arange(0, -(N - len(uniques)), -1))], ignore_index=True + ) + if unique == "left": + self.left, self.right = self.right, self.left + + def time_unique_merge(self, unique, unique_elements): + merge(self.left, self.right, how="inner") + + class MergeDatetime: params = [ [ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 629b6b42db852..ae2932a526ce3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -545,6 +545,38 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i self.na_position = na_position + @cython.boundscheck(False) + def inner_join_unique(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: + cdef: + Py_ssize_t i, n = len(values) + int ret = 0, ctr = 0 + {{c_type}} val + khiter_t k + intp_t[::1] locs = np.empty(n, dtype=np.intp) + intp_t[::1] self_locs = np.empty(n, dtype=np.intp) + int8_t na_position = self.na_position + + if self.uses_mask and mask is None: + raise NotImplementedError # pragma: no cover + + with nogil: + for i in range(n): + if self.uses_mask and mask[i]: + if self.na_position == -1: + continue + self_locs[ctr] = na_position + locs[ctr] = na_position + ctr += 1 + else: + val = {{to_c_type}}(values[i]) + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + self_locs[ctr] = self.table.vals[k] + locs[ctr] = i + ctr += 1 + + return np.asarray(self_locs)[:ctr], np.asarray(locs)[:ctr] + @cython.boundscheck(False) def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray: # -> np.ndarray[np.intp] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3c01778e05f3d..bbcdac90256d7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4698,6 +4698,28 @@ def _can_use_libjoin(self) -> bool: # Doing so seems to break test_concat_datetime_timezone return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) + @final + def _can_use_unique_join(self, other: Index, how, sort: bool) -> bool: + if how != "inner": + return False + if sort: + # Not worth it if we're going to sort the result + return False + + if self.dtype.kind in "iufb" or self.dtype == "O": + return other.is_unique or self.is_unique + return False + + def _unique_join(self, other: Index) -> tuple[np.ndarray, np.ndarray]: + if other.is_unique: + ridx, lidx = other._engine.mapping.inner_join_unique(self._values) + else: + lidx, ridx = self._engine.mapping.inner_join_unique(other._values) + indexer = np.argsort(lidx) + lidx = lidx[indexer] + ridx = ridx[indexer] + return lidx, ridx + # -------------------------------------------------------------------- # Uncategorized Methods diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ea2ac24e13c8..b3e93e419cb9b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1745,6 +1745,8 @@ def get_join_indexers( and (left.is_unique or right.is_unique) ): _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort) + elif left._can_use_unique_join(right, how, sort): + lidx, ridx = left._unique_join(right) else: lidx, ridx = get_join_indexers_non_unique( left._values, right._values, sort, how From 6bad79eceecd387d4ab42298dbe62f2ac75a15e7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 16:07:18 -0500 Subject: [PATCH 02/24] Restrict to other unique --- pandas/core/indexes/base.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bbcdac90256d7..287e3747c9e0c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4707,17 +4707,11 @@ def _can_use_unique_join(self, other: Index, how, sort: bool) -> bool: return False if self.dtype.kind in "iufb" or self.dtype == "O": - return other.is_unique or self.is_unique + return other.is_unique return False def _unique_join(self, other: Index) -> tuple[np.ndarray, np.ndarray]: - if other.is_unique: - ridx, lidx = other._engine.mapping.inner_join_unique(self._values) - else: - lidx, ridx = self._engine.mapping.inner_join_unique(other._values) - indexer = np.argsort(lidx) - lidx = lidx[indexer] - ridx = ridx[indexer] + ridx, lidx = other._engine.mapping.inner_join_unique(self._values) return lidx, ridx # -------------------------------------------------------------------- From 8f162eba07104f455fe06f6da24e44546c2d3e3a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 16:43:14 -0500 Subject: [PATCH 03/24] Fixup --- asv_bench/benchmarks/join_merge.py | 10 ++++------ pandas/core/indexes/base.py | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index b5a1007b566c9..03c0f92d766c6 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -234,10 +234,10 @@ def time_i8merge(self, how): class UniqueMerge: - params = [["left", "right"], [4_000_000, 1_000_000]] - param_names = ["unique", "unique_elements"] + params = [4_000_000, 1_000_000] + param_names = ["unique_elements"] - def setup(self, unique, unique_elements): + def setup(self, unique_elements): N = 1_000_000 self.left = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) self.right = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) @@ -245,10 +245,8 @@ def setup(self, unique, unique_elements): self.right["a"] = concat( [uniques, Series(np.arange(0, -(N - len(uniques)), -1))], ignore_index=True ) - if unique == "left": - self.left, self.right = self.right, self.left - def time_unique_merge(self, unique, unique_elements): + def time_unique_merge(self, unique_elements): merge(self.left, self.right, how="inner") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 287e3747c9e0c..f5f9bb1830c88 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4706,8 +4706,8 @@ def _can_use_unique_join(self, other: Index, how, sort: bool) -> bool: # Not worth it if we're going to sort the result return False - if self.dtype.kind in "iufb" or self.dtype == "O": - return other.is_unique + if self.dtype.kind in "iufb": + return other.is_unique and other._engine.is_mapping_populated return False def _unique_join(self, other: Index) -> tuple[np.ndarray, np.ndarray]: From 2a52d2ac96bdd97fd5c5ca19906fc9d997e6aac4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:26:35 -0500 Subject: [PATCH 04/24] Rewrite unique computation --- pandas/_libs/hashtable.pyx | 4 +- pandas/_libs/hashtable_class_helper.pxi.in | 47 +++++++++++++++++++++- pandas/core/indexes/base.py | 16 -------- pandas/core/reshape/merge.py | 35 +++++++++++----- 4 files changed, 71 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 8250d0242c31f..d78dfef41095e 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -75,7 +75,7 @@ cdef class Factorizer: def get_count(self) -> int: return self.count - def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: + def factorize(self, values, mask=None, na_sentinel=-1, na_value=None) -> np.ndarray: raise NotImplementedError @@ -89,7 +89,7 @@ cdef class ObjectFactorizer(Factorizer): self.uniques = ObjectVector() def factorize( - self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None + self, ndarray[object] values, mask=None, na_sentinel=-1, na_value=None ) -> np.ndarray: """ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ae2932a526ce3..7dbf4027f448d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -545,6 +545,49 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i self.na_position = na_position + @cython.boundscheck(False) + def is_unique(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> bool: + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{c_type}} val + khiter_t k + int8_t na_position = self.na_position + + if self.uses_mask and mask is None: + raise NotImplementedError # pragma: no cover + + with nogil: + if self.uses_mask: + for i in range(n): + if mask[i]: + if na_position != -1: + i = 0 + break + na_position = i + else: + val = {{to_c_type}}(values[i]) + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + i = 0 + break + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + else: + for i in range(n): + val = {{to_c_type}}(values[i]) + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + i = 0 + break + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + self.na_position = na_position + if i == 0: + return False + else: + return True + @cython.boundscheck(False) def inner_join_unique(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: cdef: @@ -903,8 +946,8 @@ cdef class {{name}}Factorizer(Factorizer): self.table = {{name}}HashTable(size_hint) self.uniques = {{name}}Vector() - def factorize(self, const {{c_type}}[:] values, - na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: + def factorize(self, const {{c_type}}[:] values, object mask=None, + na_sentinel=-1, na_value=None) -> np.ndarray: """ Returns ------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f5f9bb1830c88..3c01778e05f3d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4698,22 +4698,6 @@ def _can_use_libjoin(self) -> bool: # Doing so seems to break test_concat_datetime_timezone return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) - @final - def _can_use_unique_join(self, other: Index, how, sort: bool) -> bool: - if how != "inner": - return False - if sort: - # Not worth it if we're going to sort the result - return False - - if self.dtype.kind in "iufb": - return other.is_unique and other._engine.is_mapping_populated - return False - - def _unique_join(self, other: Index) -> tuple[np.ndarray, np.ndarray]: - ridx, lidx = other._engine.mapping.inner_join_unique(self._values) - return lidx, ridx - # -------------------------------------------------------------------- # Uncategorized Methods diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b3e93e419cb9b..4728a3fee6baf 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1739,15 +1739,24 @@ def get_join_indexers( left = Index(lkey) right = Index(rkey) + computed = False + if ( left.is_monotonic_increasing and right.is_monotonic_increasing and (left.is_unique or right.is_unique) ): _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort) - elif left._can_use_unique_join(right, how, sort): - lidx, ridx = left._unique_join(right) - else: + computed = True + elif how == "inner" and not sort and right.dtype.kind in "iufb": + arr, mask = _convert_arrays_for_rizer(rkey) + htable, arr = algos._get_hashtable_algo(arr) + tbl = htable() + if tbl.is_unique(arr, mask): + ridx, lidx = tbl.inner_join_unique(*_convert_arrays_for_rizer(lkey)) + computed = True + + if not computed: lidx, ridx = get_join_indexers_non_unique( left._values, right._values, sort, how ) @@ -2533,18 +2542,14 @@ def _factorize_keys( if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(lk._data, mask=lk._mask) - rlab = rizer.factorize(rk._data, mask=rk._mask) + llab = rizer.factorize(*_convert_arrays_for_rizer(lk)) + rlab = rizer.factorize(*_convert_arrays_for_rizer(rk)) elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow - llab = rizer.factorize( - lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() - ) - rlab = rizer.factorize( - rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() - ) + llab = rizer.factorize(*_convert_arrays_for_rizer(lk)) + rlab = rizer.factorize(*_convert_arrays_for_rizer(rk)) else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], @@ -2576,6 +2581,14 @@ def _factorize_keys( return llab, rlab, count +def _convert_arrays_for_rizer(arr): + if isinstance(arr, BaseMaskedArray): + return arr._data, arr._mask + elif isinstance(arr, ArrowExtensionArray): + return arr.to_numpy(na_value=1, dtype=arr.dtype.numpy_dtype), arr.isna() + return arr, None + + def _convert_arrays_and_get_rizer_klass( lk: ArrayLike, rk: ArrayLike ) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: From 38018950b124a601cb5471c15fa65b2a47053d20 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:32:53 -0500 Subject: [PATCH 05/24] Improve performance --- pandas/_libs/hashtable_class_helper.pxi.in | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 7dbf4027f448d..2012b21061f18 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -567,21 +567,19 @@ cdef class {{name}}HashTable(HashTable): na_position = i else: val = {{to_c_type}}(values[i]) - k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - i = 0 - break k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i else: for i in range(n): val = {{to_c_type}}(values[i]) - k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - i = 0 - break k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i + + if i % 10_000 == 0: + if self.table.size != i + 1: + i = 0 + break + self.na_position = na_position if i == 0: return False From 223f136e6c99d3326b6278eff0857eee8f6618a5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:35:05 -0500 Subject: [PATCH 06/24] Improve performance --- pandas/_libs/hashtable_class_helper.pxi.in | 5 ----- pandas/core/reshape/merge.py | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 2012b21061f18..def8bf576a490 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -575,11 +575,6 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i - if i % 10_000 == 0: - if self.table.size != i + 1: - i = 0 - break - self.na_position = na_position if i == 0: return False diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4728a3fee6baf..3bb0fcebecafb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1752,7 +1752,8 @@ def get_join_indexers( arr, mask = _convert_arrays_for_rizer(rkey) htable, arr = algos._get_hashtable_algo(arr) tbl = htable() - if tbl.is_unique(arr, mask): + tbl.is_unique(arr, mask) + if len(tbl) == len(arr): ridx, lidx = tbl.inner_join_unique(*_convert_arrays_for_rizer(lkey)) computed = True From 1a3bf1b15dc7535dcffd8599804a5400f3a27a96 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:40:07 -0500 Subject: [PATCH 07/24] Improve performance --- pandas/_libs/hashtable_class_helper.pxi.in | 5 +++++ pandas/core/reshape/merge.py | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index def8bf576a490..2012b21061f18 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -575,6 +575,11 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i + if i % 10_000 == 0: + if self.table.size != i + 1: + i = 0 + break + self.na_position = na_position if i == 0: return False diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3bb0fcebecafb..4728a3fee6baf 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1752,8 +1752,7 @@ def get_join_indexers( arr, mask = _convert_arrays_for_rizer(rkey) htable, arr = algos._get_hashtable_algo(arr) tbl = htable() - tbl.is_unique(arr, mask) - if len(tbl) == len(arr): + if tbl.is_unique(arr, mask): ridx, lidx = tbl.inner_join_unique(*_convert_arrays_for_rizer(lkey)) computed = True From c0f3532b8609fb71674bce4c1072e511feef5b1a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:47:44 -0500 Subject: [PATCH 08/24] Improve performance --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4728a3fee6baf..b94579a370769 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1751,7 +1751,7 @@ def get_join_indexers( elif how == "inner" and not sort and right.dtype.kind in "iufb": arr, mask = _convert_arrays_for_rizer(rkey) htable, arr = algos._get_hashtable_algo(arr) - tbl = htable() + tbl = htable(len(arr)) if tbl.is_unique(arr, mask): ridx, lidx = tbl.inner_join_unique(*_convert_arrays_for_rizer(lkey)) computed = True From 80a1cf8e332b29a011125c060b0b13762c183bb1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Mar 2024 18:22:42 -0500 Subject: [PATCH 09/24] Integrate better --- pandas/_libs/hashtable_class_helper.pxi.in | 3 ++ pandas/core/reshape/merge.py | 47 +++++++++++++--------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 2012b21061f18..de36c353a1d17 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -972,6 +972,9 @@ cdef class {{name}}Factorizer(Factorizer): self.count = len(self.uniques) return labels + def inner_join_unique(self, const {{c_type}}[:] values, const uint8_t[:] mask = None) -> tuple[np.ndarray, np.ndarray]: + return self.table.inner_join_unique(values, mask) + {{endfor}} diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b94579a370769..cd6afdc2d84c0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1739,24 +1739,13 @@ def get_join_indexers( left = Index(lkey) right = Index(rkey) - computed = False - if ( left.is_monotonic_increasing and right.is_monotonic_increasing and (left.is_unique or right.is_unique) ): _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort) - computed = True - elif how == "inner" and not sort and right.dtype.kind in "iufb": - arr, mask = _convert_arrays_for_rizer(rkey) - htable, arr = algos._get_hashtable_algo(arr) - tbl = htable(len(arr)) - if tbl.is_unique(arr, mask): - ridx, lidx = tbl.inner_join_unique(*_convert_arrays_for_rizer(lkey)) - computed = True - - if not computed: + else: lidx, ridx = get_join_indexers_non_unique( left._values, right._values, sort, how ) @@ -1791,7 +1780,12 @@ def get_join_indexers_non_unique( np.ndarray[np.intp] Indexer into right. """ - lkey, rkey, count = _factorize_keys(left, right, sort=sort) + lkey, rkey, count = _factorize_keys( + left, right, sort=sort, how=how, unique_merge=True + ) + if count == -1: + # unique merge + return lkey, rkey if how == "left": lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort) elif how == "right": @@ -2396,7 +2390,11 @@ def _left_join_on_index( def _factorize_keys( - lk: ArrayLike, rk: ArrayLike, sort: bool = True + lk: ArrayLike, + rk: ArrayLike, + sort: bool = True, + how: str = "inner", + unique_merge: bool = False, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2542,20 +2540,31 @@ def _factorize_keys( if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(*_convert_arrays_for_rizer(lk)) - rlab = rizer.factorize(*_convert_arrays_for_rizer(rk)) + lk_data = lk._data + lk_mask = lk._mask + rlab = rizer.factorize(rk._data, mask=rk._mask) elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow - llab = rizer.factorize(*_convert_arrays_for_rizer(lk)) - rlab = rizer.factorize(*_convert_arrays_for_rizer(rk)) + lk_data = lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype) + lk_mask = lk.isna() + rlab = rizer.factorize( + rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() + ) else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = rizer.factorize(lk) # type: ignore[arg-type] + lk_data = lk + lk_mask = None rlab = rizer.factorize(rk) # type: ignore[arg-type] + + if not sort and how == "inner" and rizer.get_count() == len(rlab) and unique_merge: + ridx, lidx = rizer.inner_join_unique(lk_data, lk_mask) + return lidx, ridx, -1 + + llab = rizer.factorize(lk_data, mask=lk_mask) assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype From b2d11e305dc7ad4aa7d73bc878e2866922ddd312 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 16:35:40 -0500 Subject: [PATCH 10/24] Fix bugs --- pandas/_libs/hashtable.pyi | 2 +- pandas/_libs/hashtable.pyx | 4 +- pandas/_libs/hashtable_class_helper.pxi.in | 45 +--------------------- pandas/core/reshape/merge.py | 43 ++++++++++++--------- pandas/tests/reshape/merge/test_merge.py | 18 ++++----- 5 files changed, 38 insertions(+), 74 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 3725bfa3362d9..d3fd033935112 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -16,7 +16,7 @@ def unique_label_indices( class Factorizer: count: int uniques: Any - def __init__(self, size_hint: int) -> None: ... + def __init__(self, size_hint: int, uses_mask: bool = False) -> None: ... def get_count(self) -> int: ... def factorize( self, diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index d78dfef41095e..139d86bd4f268 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -69,7 +69,7 @@ cdef class Factorizer: cdef readonly: Py_ssize_t count - def __cinit__(self, size_hint: int): + def __cinit__(self, size_hint: int, uses_mask: bool = False): self.count = 0 def get_count(self) -> int: @@ -84,7 +84,7 @@ cdef class ObjectFactorizer(Factorizer): PyObjectHashTable table ObjectVector uniques - def __cinit__(self, size_hint: int): + def __cinit__(self, size_hint: int, uses_mask: bool = False): self.table = PyObjectHashTable(size_hint) self.uniques = ObjectVector() diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index de36c353a1d17..704d2d83dd8a8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -545,47 +545,6 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i self.na_position = na_position - @cython.boundscheck(False) - def is_unique(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> bool: - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{c_type}} val - khiter_t k - int8_t na_position = self.na_position - - if self.uses_mask and mask is None: - raise NotImplementedError # pragma: no cover - - with nogil: - if self.uses_mask: - for i in range(n): - if mask[i]: - if na_position != -1: - i = 0 - break - na_position = i - else: - val = {{to_c_type}}(values[i]) - k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = i - else: - for i in range(n): - val = {{to_c_type}}(values[i]) - k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = i - - if i % 10_000 == 0: - if self.table.size != i + 1: - i = 0 - break - - self.na_position = na_position - if i == 0: - return False - else: - return True - @cython.boundscheck(False) def inner_join_unique(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: cdef: @@ -940,8 +899,8 @@ cdef class {{name}}Factorizer(Factorizer): {{name}}HashTable table {{name}}Vector uniques - def __cinit__(self, size_hint: int): - self.table = {{name}}HashTable(size_hint) + def __cinit__(self, size_hint: int, uses_mask: bool = False): + self.table = {{name}}HashTable(size_hint, uses_mask=uses_mask) self.uniques = {{name}}Vector() def factorize(self, const {{c_type}}[:] values, object mask=None, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cd6afdc2d84c0..f504910e32bf0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1781,7 +1781,7 @@ def get_join_indexers_non_unique( Indexer into right. """ lkey, rkey, count = _factorize_keys( - left, right, sort=sort, how=how, unique_merge=True + left, right, sort=sort, try_unique_merge=not sort and how == "inner" ) if count == -1: # unique merge @@ -2393,8 +2393,7 @@ def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, - how: str = "inner", - unique_merge: bool = False, + try_unique_merge: bool = False, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2536,35 +2535,41 @@ def _factorize_keys( klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) - rizer = klass(max(len(lk), len(rk))) + rizer = klass( + max(len(lk), len(rk)), + uses_mask=isinstance(rk, (BaseMaskedArray, ArrowExtensionArray)), + ) if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - lk_data = lk._data - lk_mask = lk._mask - rlab = rizer.factorize(rk._data, mask=rk._mask) + lk_data, lk_mask = lk._data, lk._mask + rk_data, rk_mask = rk._data, rk._mask elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow lk_data = lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype) - lk_mask = lk.isna() - rlab = rizer.factorize( - rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() - ) + rk_data = rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype) + lk_mask, rk_mask = lk.isna(), rk.isna() else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - lk_data = lk - lk_mask = None - rlab = rizer.factorize(rk) # type: ignore[arg-type] - - if not sort and how == "inner" and rizer.get_count() == len(rlab) and unique_merge: - ridx, lidx = rizer.inner_join_unique(lk_data, lk_mask) - return lidx, ridx, -1 + lk_data, rk_data = lk, rk + lk_mask, rk_mask = None, None + + try_unique_merge = try_unique_merge and lk.dtype.kind in "iufb" + if try_unique_merge: + rlab = rizer.factorize(rk_data, mask=rk_mask) + if rizer.get_count() == len(rlab): + ridx, lidx = rizer.inner_join_unique(lk_data, lk_mask) + return lidx, ridx, -1 + else: + llab = rizer.factorize(lk_data, mask=lk_mask) + else: + llab = rizer.factorize(lk_data, mask=lk_mask) + rlab = rizer.factorize(rk_data, mask=rk_mask) - llab = rizer.factorize(lk_data, mask=lk_mask) assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f063f333ac889..ab3c5560da02b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2924,21 +2924,21 @@ def test_merge_ea_int_and_float_numpy(): df2 = DataFrame([1.5]) expected = DataFrame(columns=[0], dtype="Int64") - with tm.assert_produces_warning(UserWarning, match="You are merging"): - result = df1.merge(df2) - tm.assert_frame_equal(result, expected) - - with tm.assert_produces_warning(UserWarning, match="You are merging"): - result = df2.merge(df1) - tm.assert_frame_equal(result, expected.astype("float64")) + # with tm.assert_produces_warning(UserWarning, match="You are merging"): + # result = df1.merge(df2) + # tm.assert_frame_equal(result, expected) + # + # with tm.assert_produces_warning(UserWarning, match="You are merging"): + # result = df2.merge(df1) + # tm.assert_frame_equal(result, expected.astype("float64")) df2 = DataFrame([1.0]) expected = DataFrame([1], columns=[0], dtype="Int64") result = df1.merge(df2) tm.assert_frame_equal(result, expected) - result = df2.merge(df1) - tm.assert_frame_equal(result, expected.astype("float64")) + # result = df2.merge(df1) + # tm.assert_frame_equal(result, expected.astype("float64")) def test_merge_arrow_string_index(any_string_dtype): From 3b6b787e5bfde617e093aa17419cedeb03bb8b67 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 16:35:58 -0500 Subject: [PATCH 11/24] Fix bugs --- pandas/tests/reshape/merge/test_merge.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ab3c5560da02b..f063f333ac889 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2924,21 +2924,21 @@ def test_merge_ea_int_and_float_numpy(): df2 = DataFrame([1.5]) expected = DataFrame(columns=[0], dtype="Int64") - # with tm.assert_produces_warning(UserWarning, match="You are merging"): - # result = df1.merge(df2) - # tm.assert_frame_equal(result, expected) - # - # with tm.assert_produces_warning(UserWarning, match="You are merging"): - # result = df2.merge(df1) - # tm.assert_frame_equal(result, expected.astype("float64")) + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) df2 = DataFrame([1.0]) expected = DataFrame([1], columns=[0], dtype="Int64") result = df1.merge(df2) tm.assert_frame_equal(result, expected) - # result = df2.merge(df1) - # tm.assert_frame_equal(result, expected.astype("float64")) + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) def test_merge_arrow_string_index(any_string_dtype): From 616dfc8bbbf41292f1664244478754047ae66cb8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:16:29 -0500 Subject: [PATCH 12/24] Rename method --- asv_bench/benchmarks/join_merge.py | 95 ++++++++++++++++++++++ pandas/_libs/hashtable.pyi | 6 ++ pandas/_libs/hashtable_class_helper.pxi.in | 6 +- pandas/core/reshape/merge.py | 4 +- 4 files changed, 106 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 03c0f92d766c6..a6c6990892d38 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -20,6 +20,101 @@ from pandas import ordered_merge as merge_ordered +class Concat: + params = [0, 1] + param_names = ["axis"] + + def setup(self, axis): + N = 1000 + s = Series(N, index=Index([f"i-{i}" for i in range(N)], dtype=object)) + self.series = [s[i:-i] for i in range(1, 10)] * 50 + self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 + df = DataFrame( + {"A": range(N)}, index=date_range("20130101", periods=N, freq="s") + ) + self.empty_left = [DataFrame(), df] + self.empty_right = [df, DataFrame()] + self.mixed_ndims = [df, df.head(N // 2)] + + def time_concat_series(self, axis): + concat(self.series, axis=axis, sort=False) + + def time_concat_small_frames(self, axis): + concat(self.small_frames, axis=axis) + + def time_concat_empty_right(self, axis): + concat(self.empty_right, axis=axis) + + def time_concat_empty_left(self, axis): + concat(self.empty_left, axis=axis) + + def time_concat_mixed_ndims(self, axis): + concat(self.mixed_ndims, axis=axis) + + +class ConcatDataFrames: + params = ([0, 1], [True, False]) + param_names = ["axis", "ignore_index"] + + def setup(self, axis, ignore_index): + frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C")) + self.frame_c = [frame_c] * 20 + frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F")) + self.frame_f = [frame_f] * 20 + + def time_c_ordered(self, axis, ignore_index): + concat(self.frame_c, axis=axis, ignore_index=ignore_index) + + def time_f_ordered(self, axis, ignore_index): + concat(self.frame_f, axis=axis, ignore_index=ignore_index) + + +class ConcatIndexDtype: + params = ( + [ + "datetime64[ns]", + "int64", + "Int64", + "int64[pyarrow]", + "string[python]", + "string[pyarrow]", + ], + ["monotonic", "non_monotonic", "has_na"], + [0, 1], + [True, False], + ) + param_names = ["dtype", "structure", "axis", "sort"] + + def setup(self, dtype, structure, axis, sort): + N = 10_000 + if dtype == "datetime64[ns]": + vals = date_range("1970-01-01", periods=N) + elif dtype in ("int64", "Int64", "int64[pyarrow]"): + vals = np.arange(N, dtype=np.int64) + elif dtype in ("string[python]", "string[pyarrow]"): + vals = Index([f"i-{i}" for i in range(N)], dtype=object) + else: + raise NotImplementedError + + idx = Index(vals, dtype=dtype) + + if structure == "monotonic": + idx = idx.sort_values() + elif structure == "non_monotonic": + idx = idx[::-1] + elif structure == "has_na": + if not idx._can_hold_na: + raise NotImplementedError + idx = Index([None], dtype=dtype).append(idx) + else: + raise NotImplementedError + + self.series = [Series(i, idx[:-i]) for i in range(1, 6)] + + def time_concat_series(self, dtype, structure, axis, sort): + concat(self.series, axis=axis, sort=sort) + + class Join: params = [True, False] param_names = ["sort"] diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index d3fd033935112..7a810a988e50e 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -25,6 +25,9 @@ class Factorizer: na_value=..., mask=..., ) -> npt.NDArray[np.intp]: ... + def hash_inner_join( + self, values: np.ndarray, mask=... + ) -> tuple[np.ndarray, np.ndarray]: ... class ObjectFactorizer(Factorizer): table: PyObjectHashTable @@ -216,6 +219,9 @@ class HashTable: mask=..., ignore_na: bool = True, ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] + def hash_inner_join( + self, values: np.ndarray, mask=... + ) -> tuple[np.ndarray, np.ndarray]: ... class Complex128HashTable(HashTable): ... class Complex64HashTable(HashTable): ... diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 704d2d83dd8a8..24b4690bdf12a 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -546,7 +546,7 @@ cdef class {{name}}HashTable(HashTable): self.na_position = na_position @cython.boundscheck(False) - def inner_join_unique(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: + def hash_inner_join(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: cdef: Py_ssize_t i, n = len(values) int ret = 0, ctr = 0 @@ -931,8 +931,8 @@ cdef class {{name}}Factorizer(Factorizer): self.count = len(self.uniques) return labels - def inner_join_unique(self, const {{c_type}}[:] values, const uint8_t[:] mask = None) -> tuple[np.ndarray, np.ndarray]: - return self.table.inner_join_unique(values, mask) + def hash_inner_join(self, const {{c_type}}[:] values, const uint8_t[:] mask = None) -> tuple[np.ndarray, np.ndarray]: + return self.table.hash_inner_join(values, mask) {{endfor}} diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f504910e32bf0..b1f03b0d60ba4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2555,14 +2555,14 @@ def _factorize_keys( # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - lk_data, rk_data = lk, rk + lk_data, rk_data = lk, rk # type: ignore[assignment] lk_mask, rk_mask = None, None try_unique_merge = try_unique_merge and lk.dtype.kind in "iufb" if try_unique_merge: rlab = rizer.factorize(rk_data, mask=rk_mask) if rizer.get_count() == len(rlab): - ridx, lidx = rizer.inner_join_unique(lk_data, lk_mask) + ridx, lidx = rizer.hash_inner_join(lk_data, lk_mask) return lidx, ridx, -1 else: llab = rizer.factorize(lk_data, mask=lk_mask) From 28c7eb8a96b7d8fa8e0de11902d159b3e6238531 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:17:16 -0500 Subject: [PATCH 13/24] Rename keyword --- pandas/core/reshape/merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b1f03b0d60ba4..a27b5f9929389 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1781,7 +1781,7 @@ def get_join_indexers_non_unique( Indexer into right. """ lkey, rkey, count = _factorize_keys( - left, right, sort=sort, try_unique_merge=not sort and how == "inner" + left, right, sort=sort, hash_join_available=not sort and how == "inner" ) if count == -1: # unique merge @@ -2393,7 +2393,7 @@ def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, - try_unique_merge: bool = False, + hash_join_available: bool = False, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2558,8 +2558,8 @@ def _factorize_keys( lk_data, rk_data = lk, rk # type: ignore[assignment] lk_mask, rk_mask = None, None - try_unique_merge = try_unique_merge and lk.dtype.kind in "iufb" - if try_unique_merge: + hash_join_available = hash_join_available and lk.dtype.kind in "iufb" + if hash_join_available: rlab = rizer.factorize(rk_data, mask=rk_mask) if rizer.get_count() == len(rlab): ridx, lidx = rizer.hash_inner_join(lk_data, lk_mask) From 4cb93655235c4c5d3cc41fa7a356c4762d5aca46 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:19:20 -0500 Subject: [PATCH 14/24] Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cb211b0b72dce..da123cfaf3dcd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -282,6 +282,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) +- Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) From f60d8b439d00c72416142cc226fdf0bae27f1d0e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:24:44 -0500 Subject: [PATCH 15/24] Clean up --- pandas/_libs/hashtable.pyx | 4 ++-- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- pandas/core/reshape/merge.py | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 139d86bd4f268..b4db762cb083b 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -75,7 +75,7 @@ cdef class Factorizer: def get_count(self) -> int: return self.count - def factorize(self, values, mask=None, na_sentinel=-1, na_value=None) -> np.ndarray: + def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: raise NotImplementedError @@ -89,7 +89,7 @@ cdef class ObjectFactorizer(Factorizer): self.uniques = ObjectVector() def factorize( - self, ndarray[object] values, mask=None, na_sentinel=-1, na_value=None + self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None ) -> np.ndarray: """ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 24b4690bdf12a..2e9e2b7941995 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -903,8 +903,8 @@ cdef class {{name}}Factorizer(Factorizer): self.table = {{name}}HashTable(size_hint, uses_mask=uses_mask) self.uniques = {{name}}Vector() - def factorize(self, const {{c_type}}[:] values, object mask=None, - na_sentinel=-1, na_value=None) -> np.ndarray: + def factorize(self, const {{c_type}}[:] values, + na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: """ Returns ------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a27b5f9929389..8ce884078cc86 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2595,14 +2595,6 @@ def _factorize_keys( return llab, rlab, count -def _convert_arrays_for_rizer(arr): - if isinstance(arr, BaseMaskedArray): - return arr._data, arr._mask - elif isinstance(arr, ArrowExtensionArray): - return arr.to_numpy(na_value=1, dtype=arr.dtype.numpy_dtype), arr.isna() - return arr, None - - def _convert_arrays_and_get_rizer_klass( lk: ArrayLike, rk: ArrayLike ) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: From 243f6c0c38159ddc570045977ba907b30106ffbe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 22 Mar 2024 20:55:09 -0500 Subject: [PATCH 16/24] Fix typing --- pandas/_libs/hashtable.pyx | 3 +++ scripts/run_stubtest.py | 1 + 2 files changed, 4 insertions(+) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index b4db762cb083b..c6b4b88a2dcf9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -78,6 +78,9 @@ cdef class Factorizer: def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: raise NotImplementedError + def hash_inner_join(self, values, mask=None): + raise NotImplementedError + cdef class ObjectFactorizer(Factorizer): cdef public: diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index 6307afa1bc822..df88c61061f12 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -44,6 +44,7 @@ "pandas._libs.hashtable.HashTable.set_na", "pandas._libs.hashtable.HashTable.sizeof", "pandas._libs.hashtable.HashTable.unique", + "pandas._libs.hashtable.HashTable.hash_inner_join", # stubtest might be too sensitive "pandas._libs.lib.NoDefault", "pandas._libs.lib._NoDefault.no_default", From 7dfbe159a25858e2d8a3fa35aad89edc3e8ceb0f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:02:47 -0500 Subject: [PATCH 17/24] Update --- pandas/_libs/hashtable_class_helper.pxi.in | 30 ++++++++++++++-------- pandas/core/reshape/merge.py | 13 +++++----- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 2e9e2b7941995..a96d5a93884a8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -545,17 +545,21 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i self.na_position = na_position + @cython.wraparound(False) @cython.boundscheck(False) def hash_inner_join(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: cdef: Py_ssize_t i, n = len(values) - int ret = 0, ctr = 0 {{c_type}} val khiter_t k - intp_t[::1] locs = np.empty(n, dtype=np.intp) - intp_t[::1] self_locs = np.empty(n, dtype=np.intp) + Int64Vector locs = Int64Vector(), self_locs = Int64Vector() + Int64VectorData *l + Int64VectorData *sl int8_t na_position = self.na_position + l = &locs.data + sl = &self_locs.data + if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover @@ -564,18 +568,24 @@ cdef class {{name}}HashTable(HashTable): if self.uses_mask and mask[i]: if self.na_position == -1: continue - self_locs[ctr] = na_position - locs[ctr] = na_position - ctr += 1 + if needs_resize(l): + with gil: + locs.resize() + self_locs.resize() + append_data_int64(l, na_position) + append_data_int64(sl, na_position) else: val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: - self_locs[ctr] = self.table.vals[k] - locs[ctr] = i - ctr += 1 + if needs_resize(l): + with gil: + locs.resize() + self_locs.resize() + append_data_int64(l, i) + append_data_int64(sl, self.table.vals[k]) - return np.asarray(self_locs)[:ctr], np.asarray(locs)[:ctr] + return self_locs.to_array(), locs.to_array() @cython.boundscheck(False) def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ce884078cc86..c6a822a330200 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1780,11 +1780,9 @@ def get_join_indexers_non_unique( np.ndarray[np.intp] Indexer into right. """ - lkey, rkey, count = _factorize_keys( - left, right, sort=sort, hash_join_available=not sort and how == "inner" - ) + lkey, rkey, count = _factorize_keys(left, right, sort=sort, how=how) if count == -1: - # unique merge + # hash join return lkey, rkey if how == "left": lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort) @@ -2393,7 +2391,7 @@ def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, - hash_join_available: bool = False, + how: str | None = None, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2409,6 +2407,9 @@ def _factorize_keys( sort : bool, defaults to True If True, the encoding is done such that the unique elements in the keys are sorted. + how: str, optional + Used to determine if we can use hash-join. If not given, then just factorize + keys. Returns ------- @@ -2558,7 +2559,7 @@ def _factorize_keys( lk_data, rk_data = lk, rk # type: ignore[assignment] lk_mask, rk_mask = None, None - hash_join_available = hash_join_available and lk.dtype.kind in "iufb" + hash_join_available = how == "inner" and not sort and lk.dtype.kind in "iufb" if hash_join_available: rlab = rizer.factorize(rk_data, mask=rk_mask) if rizer.get_count() == len(rlab): From fc0ea082960a98b2ca4cb179db8463a2dd4e5483 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:04:55 -0500 Subject: [PATCH 18/24] Update docs --- pandas/core/reshape/merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c6a822a330200..2cd065d03ff53 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2418,7 +2418,8 @@ def _factorize_keys( np.ndarray[np.intp] Right (resp. left if called with `key='right'`) labels, as enumerated type. int - Number of unique elements in union of left and right labels. + Number of unique elements in union of left and right labels. -1 if we used + a hash-join. See Also -------- From 0bd0759ecf8162896b7faed9639bcfd9d9a00c5e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:15:32 -0500 Subject: [PATCH 19/24] Update --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a96d5a93884a8..08e9a22c64734 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -568,7 +568,7 @@ cdef class {{name}}HashTable(HashTable): if self.uses_mask and mask[i]: if self.na_position == -1: continue - if needs_resize(l): + if needs_resize(l) or needs_resize(sl): with gil: locs.resize() self_locs.resize() @@ -578,7 +578,7 @@ cdef class {{name}}HashTable(HashTable): val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: - if needs_resize(l): + if needs_resize(l) or needs_resize(sl): with gil: locs.resize() self_locs.resize() From 97c4c26b6820839d9f706e1ac95572290e7bb86d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:18:59 -0500 Subject: [PATCH 20/24] Update --- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 08e9a22c64734..14ea251e63d1e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -568,17 +568,17 @@ cdef class {{name}}HashTable(HashTable): if self.uses_mask and mask[i]: if self.na_position == -1: continue - if needs_resize(l) or needs_resize(sl): + if needs_resize(l): with gil: locs.resize() self_locs.resize() - append_data_int64(l, na_position) + append_data_int64(l, i) append_data_int64(sl, na_position) else: val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: - if needs_resize(l) or needs_resize(sl): + if needs_resize(l): with gil: locs.resize() self_locs.resize() From a0dd1d3b0048498efbf578a79ce92923c325a490 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:19:48 -0500 Subject: [PATCH 21/24] Update --- pandas/_libs/hashtable_class_helper.pxi.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 14ea251e63d1e..9ba722b0fa9ea 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -552,7 +552,8 @@ cdef class {{name}}HashTable(HashTable): Py_ssize_t i, n = len(values) {{c_type}} val khiter_t k - Int64Vector locs = Int64Vector(), self_locs = Int64Vector() + Int64Vector locs = Int64Vector() + Int64Vector self_locs = Int64Vector() Int64VectorData *l Int64VectorData *sl int8_t na_position = self.na_position From de001a6380bb82dbf2175b684d19e1c502f43951 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:23:06 -0500 Subject: [PATCH 22/24] Update --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 9ba722b0fa9ea..5c2e64d1aa617 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -569,7 +569,7 @@ cdef class {{name}}HashTable(HashTable): if self.uses_mask and mask[i]: if self.na_position == -1: continue - if needs_resize(l): + if needs_resize(&locs.data): with gil: locs.resize() self_locs.resize() @@ -579,7 +579,7 @@ cdef class {{name}}HashTable(HashTable): val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: - if needs_resize(l): + if needs_resize(&locs.data): with gil: locs.resize() self_locs.resize() From 3bb06046c2873bd110b36559bf33be70963f002a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:26:35 -0500 Subject: [PATCH 23/24] Fixup for changed main --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 7b8b5fdbcdfa7..e3799a8809ccb 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -581,7 +581,7 @@ cdef class {{name}}HashTable(HashTable): if self.uses_mask and mask[i]: if self.na_position == -1: continue - if needs_resize(&locs.data): + if needs_resize(l.size, l.capacity): with gil: locs.resize() self_locs.resize() @@ -591,7 +591,7 @@ cdef class {{name}}HashTable(HashTable): val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: - if needs_resize(&locs.data): + if needs_resize(l.size, l.capacity): with gil: locs.resize() self_locs.resize() From 6ee2a4cdae8a1ecc7f3d70fa08171eecd6254d03 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 14:34:23 -0500 Subject: [PATCH 24/24] Fixup for changed main --- pandas/_libs/hashtable_class_helper.pxi.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e3799a8809ccb..e3a9102fec395 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -583,8 +583,8 @@ cdef class {{name}}HashTable(HashTable): continue if needs_resize(l.size, l.capacity): with gil: - locs.resize() - self_locs.resize() + locs.resize(locs.data.capacity * 4) + self_locs.resize(locs.data.capacity * 4) append_data_int64(l, i) append_data_int64(sl, na_position) else: @@ -593,8 +593,8 @@ cdef class {{name}}HashTable(HashTable): if k != self.table.n_buckets: if needs_resize(l.size, l.capacity): with gil: - locs.resize() - self_locs.resize() + locs.resize(locs.data.capacity * 4) + self_locs.resize(locs.data.capacity * 4) append_data_int64(l, i) append_data_int64(sl, self.table.vals[k])