From ed77ca3b187320a550c7bbbbab784e54ddd746e6 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 12 Mar 2023 19:31:52 -0400
Subject: [PATCH] PERF: Reject non-string object arrays faster in factorize

---
 asv_bench/benchmarks/algorithms.py | 7 ++++++-
 doc/source/whatsnew/v2.1.0.rst     | 1 +
 pandas/core/algorithms.py          | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index eef81242abc7c..2584e1f13853a 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -23,6 +23,7 @@ class Factorize:
             "uint",
             "float",
             "object",
+            "object_str",
             "datetime64[ns]",
             "datetime64[ns, tz]",
             "Int64",
@@ -46,7 +47,8 @@ def setup(self, unique, sort, dtype):
             "int": pd.Index(np.arange(N), dtype="int64"),
             "uint": pd.Index(np.arange(N), dtype="uint64"),
             "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "object": string_index,
+            "object_str": string_index,
+            "object": pd.Index(np.arange(N), dtype="object"),
             "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
@@ -62,6 +64,9 @@ def setup(self, unique, sort, dtype):
     def time_factorize(self, unique, sort, dtype):
         pd.factorize(self.data, sort=sort)
 
+    def peakmem_factorize(self, unique, sort, dtype):
+        pd.factorize(self.data, sort=sort)
+
 
 class Duplicated:
     params = [
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index c12807304f74d..06069495ac1d0 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -116,6 +116,7 @@ Performance improvements
 - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
 - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
 - Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
+- Performance improvement in :func:`factorize` for object-dtype data that does not consist entirely of strings (:issue:`51921`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index c82b47867fbb3..49c82f92439cc 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -292,7 +292,7 @@ def _check_object_for_strings(values: np.ndarray) -> str:
         # it's cheaper to use a String Hash Table than Object; we infer
         # including nulls because that is the only difference between
         # StringHashTable and ObjectHashtable
-        if lib.infer_dtype(values, skipna=False) in ["string"]:
+        if lib.is_string_array(values, skipna=False):
             ndtype = "string"
     return ndtype