From 2b0e85ee5cbaffc3766da272ad7095879bcb8af4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 10 Nov 2022 21:24:53 +0100 Subject: [PATCH 1/6] ENH: Compile Factorizer class for all numeric dtypes --- pandas/_libs/hashtable.pyi | 132 +++++++++++++++++++++ pandas/_libs/hashtable.pyx | 49 -------- pandas/_libs/hashtable_class_helper.pxi.in | 50 ++++++++ 3 files changed, 182 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 18ebc1ff2bd1f..14c35ec1eb990 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -39,6 +39,138 @@ class Int64Factorizer(Factorizer): mask=..., ) -> npt.NDArray[np.intp]: ... +class UInt64Factorizer(Factorizer): + table: UInt64HashTable + uniques: UInt64Vector + def factorize( + self, + values: np.ndarray, # const uint64_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Int32Factorizer(Factorizer): + table: Int32HashTable + uniques: Int32Vector + def factorize( + self, + values: np.ndarray, # const int32_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class UInt32Factorizer(Factorizer): + table: UInt32HashTable + uniques: UInt32Vector + def factorize( + self, + values: np.ndarray, # const uint32_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Int16Factorizer(Factorizer): + table: Int16HashTable + uniques: Int16Vector + def factorize( + self, + values: np.ndarray, # const int16_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class UInt16Factorizer(Factorizer): + table: UInt16HashTable + uniques: UInt16Vector + def factorize( + self, + values: np.ndarray, # const uint16_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Int8Factorizer(Factorizer): + table: Int8HashTable + uniques: Int8Vector + def factorize( + self, + values: np.ndarray, # const int8_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class UInt8Factorizer(Factorizer): + table: UInt8HashTable + uniques: UInt8Vector + def factorize( + self, + values: np.ndarray, # const uint8_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Float64Factorizer(Factorizer): + table: Float64HashTable + uniques: Float64Vector + def factorize( + self, + values: np.ndarray, # const float64_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Float32Factorizer(Factorizer): + table: Float32HashTable + uniques: Float32Vector + def factorize( + self, + values: np.ndarray, # const float32_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Complex64Factorizer(Factorizer): + table: Complex64HashTable + uniques: Complex64Vector + def factorize( + self, + values: np.ndarray, # const complex64_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class Complex128Factorizer(Factorizer): + table: Complex128HashTable + uniques: Complex128Vector + def factorize( + self, + values: np.ndarray, # const complex128_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + class Int64Vector: def __init__(self, *args) -> None: ... def __len__(self) -> int: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index e4e9b24d725c6..13da0457868b8 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -69,17 +69,6 @@ else: raise ValueError(np.dtype(np.intp)) -cdef class Factorizer: - cdef readonly: - Py_ssize_t count - - def __cinit__(self, size_hint: int): - self.count = 0 - - def get_count(self) -> int: - return self.count - - cdef class ObjectFactorizer(Factorizer): cdef public: PyObjectHashTable table @@ -117,41 +106,3 @@ cdef class ObjectFactorizer(Factorizer): self.count, na_sentinel, na_value) self.count = len(self.uniques) return labels - - -cdef class Int64Factorizer(Factorizer): - cdef public: - Int64HashTable table - Int64Vector uniques - - def __cinit__(self, size_hint: int): - self.table = Int64HashTable(size_hint) - self.uniques = Int64Vector() - - def factorize(self, const int64_t[:] values, - na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: - """ - Returns - ------- - ndarray[intp_t] - - Examples - -------- - Factorize values with nans replaced by na_sentinel - - >>> fac = Int64Factorizer(3) - >>> fac.factorize(np.array([1,2,3]), na_sentinel=20) - array([0, 1, 2]) - """ - cdef: - ndarray[intp_t] labels - - if self.uniques.external_view_exists: - uniques = Int64Vector() - uniques.extend(self.uniques.to_array()) - self.uniques = uniques - labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, - na_value=na_value, mask=mask) - self.count = len(self.uniques) - return labels diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bda8cd83c0605..cf65d7c21d1a4 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -101,6 +101,18 @@ from pandas._libs.khash cimport ( from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA + +cdef class Factorizer: + cdef readonly: + Py_ssize_t count + + def __cinit__(self, size_hint: int): + self.count = 0 + + def get_count(self) -> int: + return self.count + + {{py: # name, dtype, c_type @@ -876,6 +888,44 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques {{endif}} + +cdef class {{name}}Factorizer(Factorizer): + cdef public: + {{name}}HashTable table + {{name}}Vector uniques + + def __cinit__(self, size_hint: int): + self.table = {{name}}HashTable(size_hint) + self.uniques = {{name}}Vector() + + def factorize(self, const {{c_type}}[:] values, + na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: + """ + Returns + ------- + ndarray[intp_t] + + Examples + -------- + Factorize values with nans replaced by na_sentinel + + >>> fac = {{name}}Factorizer(3) + >>> fac.factorize(np.array([1,2,3]), na_sentinel=20) + array([0, 1, 2]) + """ + cdef: + ndarray[intp_t] labels + + if self.uniques.external_view_exists: + uniques = {{name}}Vector() + uniques.extend(self.uniques.to_array()) + self.uniques = uniques + labels = self.table.get_labels(values, self.uniques, + self.count, na_sentinel, + na_value=na_value, mask=mask) + self.count = len(self.uniques) + return labels + {{endfor}} From 6969a802d4e1435d1d125b349a931285ec2cf85a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Nov 2022 17:37:48 +0100 Subject: [PATCH 2/6] Fix test --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index cf65d7c21d1a4..bb3d850803e1e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -910,7 +910,7 @@ cdef class {{name}}Factorizer(Factorizer): Factorize values with nans replaced by na_sentinel >>> fac = {{name}}Factorizer(3) - >>> fac.factorize(np.array([1,2,3]), na_sentinel=20) + >>> fac.factorize(np.array([1,2,3], dtype={{dtype}}), na_sentinel=20) array([0, 1, 2]) """ cdef: From 61a5928f6bc8da349e716a21e57fd03bf654426d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Nov 2022 22:59:09 +0100 Subject: [PATCH 3/6] Fix test --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bb3d850803e1e..8eaec40b16ec2 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -910,7 +910,7 @@ cdef class {{name}}Factorizer(Factorizer): Factorize values with nans replaced by na_sentinel >>> fac = {{name}}Factorizer(3) - >>> fac.factorize(np.array([1,2,3], dtype={{dtype}}), na_sentinel=20) + >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20) array([0, 1, 2]) """ cdef: From 35b99f372e76fdeedc6ea69043e32ecaa6b1cd4d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Nov 2022 17:25:58 +0100 Subject: [PATCH 4/6] Add factorize to base class --- pandas/_libs/hashtable.pyi | 106 +-------------------- pandas/_libs/hashtable.pyx | 5 +- pandas/_libs/hashtable_class_helper.pxi.in | 2 + 3 files changed, 11 insertions(+), 102 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 14c35ec1eb990..eb0b46101c2d8 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -15,161 +15,65 @@ class Factorizer: count: int def __init__(self, size_hint: int) -> None: ... def get_count(self) -> int: ... - -class ObjectFactorizer(Factorizer): - table: PyObjectHashTable - uniques: ObjectVector def factorize( self, - values: npt.NDArray[np.object_], + values: np.ndarray, sort: bool = ..., na_sentinel=..., na_value=..., ) -> npt.NDArray[np.intp]: ... +class ObjectFactorizer(Factorizer): + table: PyObjectHashTable + uniques: ObjectVector + class Int64Factorizer(Factorizer): table: Int64HashTable uniques: Int64Vector - def factorize( - self, - values: np.ndarray, # const int64_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class UInt64Factorizer(Factorizer): table: UInt64HashTable uniques: UInt64Vector - def factorize( - self, - values: np.ndarray, # const uint64_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Int32Factorizer(Factorizer): table: Int32HashTable uniques: Int32Vector - def factorize( - self, - values: np.ndarray, # const int32_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class UInt32Factorizer(Factorizer): table: UInt32HashTable uniques: UInt32Vector - def factorize( - self, - values: np.ndarray, # const uint32_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Int16Factorizer(Factorizer): table: Int16HashTable uniques: Int16Vector - def factorize( - self, - values: np.ndarray, # const int16_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class UInt16Factorizer(Factorizer): table: UInt16HashTable uniques: UInt16Vector - def factorize( - self, - values: np.ndarray, # const uint16_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Int8Factorizer(Factorizer): table: Int8HashTable uniques: Int8Vector - def factorize( - self, - values: np.ndarray, # const int8_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class UInt8Factorizer(Factorizer): table: UInt8HashTable uniques: UInt8Vector - def factorize( - self, - values: np.ndarray, # const uint8_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Float64Factorizer(Factorizer): table: Float64HashTable uniques: Float64Vector - def factorize( - self, - values: np.ndarray, # const float64_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Float32Factorizer(Factorizer): table: Float32HashTable uniques: Float32Vector - def factorize( - self, - values: np.ndarray, # const float32_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Complex64Factorizer(Factorizer): table: Complex64HashTable uniques: Complex64Vector - def factorize( - self, - values: np.ndarray, # const complex64_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Complex128Factorizer(Factorizer): table: Complex128HashTable uniques: Complex128Vector - def factorize( - self, - values: np.ndarray, # const complex128_t[:] - sort: bool = ..., - na_sentinel=..., - na_value=..., - mask=..., - ) -> npt.NDArray[np.intp]: ... class Int64Vector: def __init__(self, *args) -> None: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 13da0457868b8..15db9016af119 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -79,7 +79,7 @@ cdef class ObjectFactorizer(Factorizer): self.uniques = ObjectVector() def factorize( - self, ndarray[object] values, na_sentinel=-1, na_value=None + self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None ) -> np.ndarray: """ @@ -98,6 +98,9 @@ cdef class ObjectFactorizer(Factorizer): cdef: ndarray[intp_t] labels + if mask is None: + raise NotImplementedError("mask not supported for ObjectFactorizer.") + if self.uniques.external_view_exists: uniques = ObjectVector() uniques.extend(self.uniques.to_array()) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 8eaec40b16ec2..a3df5f6bdce84 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -112,6 +112,8 @@ cdef class Factorizer: def get_count(self) -> int: return self.count + def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: + raise NotImplementedError {{py: From efd6a9ed6dc8604cbc841f53ef7895b08fb0a5c8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Nov 2022 17:34:21 +0100 Subject: [PATCH 5/6] Remove ignores --- pandas/core/reshape/merge.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 74a1051825820..cc9a7b7f8d40b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2362,14 +2362,8 @@ def _factorize_keys( rizer = klass(max(len(lk), len(rk))) - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = rizer.factorize(lk) # type: ignore[arg-type] - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - rlab = rizer.factorize(rk) # type: ignore[arg-type] + llab = rizer.factorize(lk) + rlab = rizer.factorize(rk) assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype From c2612e1399ad93816774a3bee9a923edafd0c04a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Nov 2022 17:54:51 +0100 Subject: [PATCH 6/6] Move factorizer --- pandas/_libs/hashtable.pyx | 16 +++++++++++++++- pandas/_libs/hashtable_class_helper.pxi.in | 13 ------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 15db9016af119..ccac3d0b50d45 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -69,6 +69,20 @@ else: raise ValueError(np.dtype(np.intp)) +cdef class Factorizer: + cdef readonly: + Py_ssize_t count + + def __cinit__(self, size_hint: int): + self.count = 0 + + def get_count(self) -> int: + return self.count + + def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: + raise NotImplementedError + + cdef class ObjectFactorizer(Factorizer): cdef public: PyObjectHashTable table @@ -98,7 +112,7 @@ cdef class ObjectFactorizer(Factorizer): cdef: ndarray[intp_t] labels - if mask is None: + if mask is not None: raise NotImplementedError("mask not supported for ObjectFactorizer.") if self.uniques.external_view_exists: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a3df5f6bdce84..47dd0cbbd7164 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -102,19 +102,6 @@ from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA -cdef class Factorizer: - cdef readonly: - Py_ssize_t count - - def __cinit__(self, size_hint: int): - self.count = 0 - - def get_count(self) -> int: - return self.count - - def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: - raise NotImplementedError - {{py: # name, dtype, c_type