From ed77ca3b187320a550c7bbbbab784e54ddd746e6 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 12 Mar 2023 19:31:52 -0400
Subject: [PATCH] PERF: Reject non-string object arrays faster in factorize

---
 asv_bench/benchmarks/algorithms.py | 7 ++++++-
 doc/source/whatsnew/v2.1.0.rst     | 1 +
 pandas/core/algorithms.py          | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index eef81242abc7c..2584e1f13853a 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -23,6 +23,7 @@ class Factorize:
             "uint",
             "float",
             "object",
+            "object_str",
             "datetime64[ns]",
             "datetime64[ns, tz]",
             "Int64",
@@ -46,7 +47,8 @@ def setup(self, unique, sort, dtype):
             "int": pd.Index(np.arange(N), dtype="int64"),
             "uint": pd.Index(np.arange(N), dtype="uint64"),
             "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "object": string_index,
+            "object_str": string_index,
+            "object": pd.Index(np.arange(N), dtype="object"),
             "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
@@ -62,6 +64,9 @@ def setup(self, unique, sort, dtype):
     def time_factorize(self, unique, sort, dtype):
         pd.factorize(self.data, sort=sort)
 
+    def peakmem_factorize(self, unique, sort, dtype):
+        pd.factorize(self.data, sort=sort)
+
 
 class Duplicated:
     params = [
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index c12807304f74d..06069495ac1d0 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -116,6 +116,7 @@ Performance improvements
 - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
 - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
 - Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
+- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index c82b47867fbb3..49c82f92439cc 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -292,7 +292,7 @@ def _check_object_for_strings(values: np.ndarray) -> str:
         # it's cheaper to use a String Hash Table than Object; we infer
         # including nulls because that is the only difference between
         # StringHashTable and ObjectHashtable
-        if lib.infer_dtype(values, skipna=False) in ["string"]:
+        if lib.is_string_array(values, skipna=False):
             ndtype = "string"
     return ndtype
 