diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 84f594acf5e4c..980103ad3ad8e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -531,7 +531,7 @@ Reshaping - Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) - Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns both multiindexed (:issue:`36360`) - Bug in :func:`join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`) -- +- Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) Sparse ^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 53e5d95907bde..6620a5b5e737f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -320,7 +320,7 @@ def __init__( # sanitize_array coerces np.nan to a string under certain versions # of numpy values = maybe_infer_to_datetimelike(values, convert_dates=True) - if not isinstance(values, np.ndarray): + if not isinstance(values, (np.ndarray, ExtensionArray)): values = com.convert_to_list_like(values) # By convention, empty lists result in object dtype: diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index d1f38d90547fd..4850c6a50f8a8 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -353,3 +353,15 @@ def test_combine_first_with_asymmetric_other(self, val): exp = DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) + + def test_combine_first_string_dtype_only_na(self): + # GH: 37519 + df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, 
dtype="string") + df.set_index(["a", "b"], inplace=True) + df2.set_index(["a", "b"], inplace=True) + result = df.combine_first(df2) + expected = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" + ).set_index(["a", "b"]) + tm.assert_frame_equal(result, expected)