pandas-dev · mroeschke · Mar 13, 2023 · Mar 12, 2023
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -23,6 +23,7 @@ class Factorize:
             "uint",
             "float",
             "object",
+            "object_str",
             "datetime64[ns]",
             "datetime64[ns, tz]",
             "Int64",
@@ -46,7 +47,8 @@ def setup(self, unique, sort, dtype):
             "int": pd.Index(np.arange(N), dtype="int64"),
             "uint": pd.Index(np.arange(N), dtype="uint64"),
             "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "object": string_index,
+            "object_str": string_index,
+            "object": pd.Index(np.arange(N), dtype="object"),
             "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
@@ -62,6 +64,9 @@ def setup(self, unique, sort, dtype):
     def time_factorize(self, unique, sort, dtype):
         pd.factorize(self.data, sort=sort)
 
+    def peakmem_factorize(self, unique, sort, dtype):
+        pd.factorize(self.data, sort=sort)
+
 
 class Duplicated:
     params = [

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -116,6 +116,7 @@ Performance improvements
 - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
 - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
 - Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
+- Performance improvement in :func:`factorize` for object-dtype data that does not consist entirely of strings (:issue:`51921`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -292,7 +292,7 @@ def _check_object_for_strings(values: np.ndarray) -> str:
         # it's cheaper to use a String Hash Table than Object; we infer
         # including nulls because that is the only difference between
         # StringHashTable and ObjectHashtable
-        if lib.infer_dtype(values, skipna=False) in ["string"]:
+        if lib.is_string_array(values, skipna=False):
             ndtype = "string"
     return ndtype