diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index 25df5b0214959..17ded2ef2f9b1 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -232,4 +232,17 @@ def time_operation(self, index_structure, dtype, method):
         getattr(self.left, method)(self.right)
 
 
+class Append:
+    def setup(self):
+        self.MI1 = MultiIndex.from_arrays(
+            [np.arange(10), np.arange(100, 200, 10)], names=["first", "second"]
+        )
+        self.MI2 = MultiIndex.from_arrays(
+            [np.arange(10, 20, 1), np.arange(200, 300, 10)], names=["first", "third"]
+        )
+
+    def time_append(self):
+        self.MI1.append(self.MI2)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index d654cf5715bdf..1968e450764b4 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -942,6 +942,7 @@ MultiIndex
 
 - Bug in :meth:`MultiIndex.intersection` always returning empty when intersecting with :class:`CategoricalIndex` (:issue:`38653`)
 - Bug in :meth:`MultiIndex.reindex` raising ``ValueError`` with empty MultiIndex and indexing only a specific level (:issue:`41170`)
+
 
 I/O
 ^^^
@@ -1060,6 +1061,8 @@ Reshaping
 - Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`)
 - Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
 - Bug in :func:`to_datetime` raising ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`)
+- Bug in :func:`concat` not matching index names when concatenating two :class:`DataFrame` objects with a :class:`MultiIndex` (:issue:`40849`)
+
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 86d1503fe31c0..276bb1ba66ff7 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -26,6 +26,7 @@
     lib,
 )
 from pandas._libs.hashtable import duplicated
+from pandas._libs.missing import NA
 from pandas._typing import (
     AnyArrayLike,
     DtypeObj,
@@ -45,7 +46,10 @@
     doc,
 )
 
-from pandas.core.dtypes.cast import coerce_indexer_dtype
+from pandas.core.dtypes.cast import (
+    coerce_indexer_dtype,
+    convert_dtypes,
+)
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_platform_int,
@@ -2147,11 +2151,36 @@ def append(self, other):
             (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
         ):
             arrays = []
-            for i in range(self.nlevels):
-                label = self._get_level_values(i)
-                appended = [o._get_level_values(i) for o in other]
-                arrays.append(label.append(appended))
-            return MultiIndex.from_arrays(arrays, names=self.names)
+            if self.names.count(None) > 1 or any(
+                o.names.count(None) > 1 for o in other
+            ):
+
+                arrays, index_label_list = self.simple_append(other=other)
+
+            else:
+                index_label_list = self.get_unique_indexes(other)
+
+                for index_label in index_label_list:
+
+                    index = self.get_index_data(
+                        data_index=self, column_name=index_label, other=other
+                    )
+                    appended = []
+
+                    for o in other:
+
+                        data = self.get_index_data(
+                            data_index=o,
+                            column_name=index_label,
+                            other=other,
+                            search_self=True,
+                        )
+                        appended.append(data)
+
+                        index = index.append(data)
+
+                    arrays.append(index)
+            return MultiIndex.from_arrays(arrays, names=index_label_list)
 
         to_concat = (self._values,) + tuple(k._values for k in other)
         new_tuples = np.concatenate(to_concat)
@@ -2162,6 +2191,60 @@ def append(self, other):
         except (TypeError, IndexError):
             return Index(new_tuples)
 
+    def simple_append(self, other):
+
+        arr = []
+
+        for i in range(self.nlevels):
+
+            label = self._get_level_values(i)
+            appended = [o._get_level_values(i) for o in other]
+            arr.append(label.append(appended))
+        index_label_list = self.names
+
+        return arr, index_label_list
+
+    def get_index_data(self, data_index, column_name, other, search_self=False):
+
+        # Returns original data if the data_index input has data for this column name
+        if column_name in data_index.names:
+            Index_position = data_index.names.index(column_name)
+            data = data_index._get_level_values(Index_position)
+            return data
+
+        else:
+
+            # If the data_index input is from other and if it doesn't
+            # have the column name, it returns an Index filled with pd.NA
+            # with data type that the other dataframe has the column.
+            if search_self is True:
+                if column_name in self.names:
+                    Index_position = self.names.index(column_name)
+                    NA_type = convert_dtypes(self.levels[Index_position].values)
+                    data = Index([NA] * data_index.size, dtype=NA_type)
+                    return data
+
+            for o in other:
+                if o is not data_index and column_name in o.names:
+                    Index_position = o.names.index(column_name)
+                    NA_type = convert_dtypes(self.levels[Index_position].values)
+                    data = Index([NA] * data_index.size, dtype=NA_type)
+                    return data
+
+    def get_unique_indexes(self, other):
+
+        Union_list = list(self.names)
+
+        for o in other:
+            if not set(o.names).issubset(Union_list):
+
+                for element in o.names:
+                    if element not in Union_list:
+
+                        Union_list.append(element)
+
+        return Union_list
+
     def argsort(self, *args, **kwargs) -> np.ndarray:
         return self._values.argsort(*args, **kwargs)
 
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index b29855caf6c1d..2d2c656de4960 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -1341,3 +1341,35 @@ def test_maxmin(self, raw_data, max_expected, min_expected):
         min_result = SparseArray(raw_data).min()
         assert max_result in max_expected
         assert min_result in min_expected
+
+
+def test_concat_with_different_index_arrangement():
+    df_first = pd.DataFrame(
+        [["i1_top", "i2_top", 1]], columns=["index1", "index2", "value1"]
+    )
+    df_second = pd.DataFrame(
+        [["i1_middle", "i2_middle", 1]], columns=["index1", "index3", "value1"]
+    )
+    df_third = pd.DataFrame(
+        [["i1_bottom", "i2_bottom", 1]], columns=["index1", "index4", "value1"]
+    )
+
+    df_concatenated_expected = pd.DataFrame(
+        [
+            ["i1_top", "i2_top", 1, pd.NA, pd.NA],
+            ["i1_middle", pd.NA, 1, "i2_middle", pd.NA],
+            ["i1_bottom", pd.NA, 1, pd.NA, "i2_bottom"],
+        ],
+        columns=["index1", "index2", "value1", "index3", "index4"],
+    )
+
+    df_first.set_index(["index1", "index2"], inplace=True)
+    df_second.set_index(["index3", "index1"], inplace=True)
+    df_third.set_index(["index4", "index1"], inplace=True)
+
+    df_concatenated_result = pd.concat([df_first, df_second, df_third])
+    df_concatenated_expected.set_index(
+        ["index1", "index2", "index3", "index4"], inplace=True
+    )
+
+    tm.assert_frame_equal(df_concatenated_result, df_concatenated_expected)