diff --git a/pandas/tests/reshape/concat/__init__.py b/pandas/tests/reshape/concat/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/reshape/concat/conftest.py b/pandas/tests/reshape/concat/conftest.py new file mode 100644 index 0000000000000..62b8c59ba8855 --- /dev/null +++ b/pandas/tests/reshape/concat/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 2f9228bc84394..ffeda703cd890 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -11,12 +11,6 @@ import pandas._testing as tm -@pytest.fixture(params=[True, False]) -def sort(request): - """Boolean sort keyword for concat and DataFrame.append.""" - return request.param - - class TestAppend: def test_append(self, sort, float_frame): mixed_frame = float_frame.copy() diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 7ca3ae65706fe..395673e9a47ab 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -725,3 +725,26 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_categorical_concat_append(self): + cat = Categorical(["a", "b"], categories=["a", "b"]) + vals = [1, 2] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) + vals2 = [1, 2, 1, 2] + exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) + + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) + + # GH 13524 can concat different 
categories + cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) + vals3 = [1, 2] + df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) + + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) + + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py new file mode 100644 index 0000000000000..388575c5a3b86 --- /dev/null +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -0,0 +1,195 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Series +import pandas._testing as tm + + +class TestCategoricalConcat: + def test_categorical_concat(self, sort): + # See GH 10177 + df1 = DataFrame( + np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] + ) + + df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) + + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2["h"] = Series(Categorical(cat_values)) + + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) + exp = DataFrame( + { + "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + "b": [ + 1, + 4, + 7, + 10, + 13, + 16, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + "h": [None] * 6 + cat_values, + } + ) + tm.assert_frame_equal(res, exp) + + def test_categorical_concat_dtypes(self): + + # GH8143 + index = ["cat", "obj", "num"] + cat = Categorical(["a", "b", "c"]) + obj = Series(["a", "b", "c"]) + num = Series([1, 2, 3]) + df = pd.concat([Series(cat), obj, num], axis=1, keys=index) + + result = df.dtypes == "object" + expected = Series([False, True, False], index=index) + 
tm.assert_series_equal(result, expected) + + result = df.dtypes == "int64" + expected = Series([False, False, True], index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == "category" + expected = Series([True, False, False], index=index) + tm.assert_series_equal(result, expected) + + def test_concat_categoricalindex(self): + # GH 16111, categories that aren't lexsorted + categories = [9, 0, 1, 2, 3] + + a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) + b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) + c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) + + result = pd.concat([a, b, c], axis=1) + + exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) + exp = DataFrame( + { + 0: [1, 1, np.nan, np.nan], + 1: [np.nan, 2, 2, np.nan], + 2: [np.nan, np.nan, 3, 3], + }, + columns=[0, 1, 2], + index=exp_idx, + ) + tm.assert_frame_equal(result, exp) + + def test_categorical_concat_preserve(self): + + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories + s = Series(list("abc"), dtype="category") + s2 = Series(list("abd"), dtype="category") + + exp = Series(list("abcabd")) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series(list("abcabc"), dtype="category") + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) + + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) + + df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) + res = pd.concat([df2, df2]) + exp = DataFrame( + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ) + tm.assert_frame_equal(res, exp) + + def test_categorical_index_preserver(self): + + a = Series(np.arange(6, 
dtype="int64")) + b = Series(list("aabbca")) + + df2 = DataFrame( + {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} + ).set_index("B") + result = pd.concat([df2, df2]) + expected = DataFrame( + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + tm.assert_frame_equal(result, expected) + + # wrong categories + df3 = DataFrame( + {"A": a, "B": Categorical(b, categories=list("abe"))} + ).set_index("B") + msg = "categories must match existing categories when appending" + with pytest.raises(TypeError, match=msg): + pd.concat([df2, df3]) + + def test_concat_categorical_tz(self): + # GH-23816 + a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) + b = Series(["a", "b"], dtype="category") + result = pd.concat([a, b], ignore_index=True) + expected = Series( + [ + pd.Timestamp("2017-01-01", tz="US/Pacific"), + pd.Timestamp("2017-01-02", tz="US/Pacific"), + "a", + "b", + ] + ) + tm.assert_series_equal(result, expected) + + def test_concat_categorical_unchanged(self): + # GH-12007 + # test fix for when concat on categorical and float + # coerces dtype categorical -> float + df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A")) + ser = Series([0, 1, 2], index=[0, 1, 3], name="B") + result = pd.concat([df, ser], axis=1) + expected = DataFrame( + { + "A": Series(["a", "b", "c", np.nan], dtype="category"), + "B": Series([0, 1, np.nan, 2], dtype="float"), + } + ) + tm.assert_equal(result, expected) + + def test_categorical_concat_gh7864(self): + # GH 7864 + # make sure ordering is preserved + df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) + df["grade"] = Categorical(df["raw_grade"]) + df["grade"].cat.set_categories(["e", "a", "b"]) + + df1 = df[0:3] + df2 = df[3:] + + tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) + + dfx = pd.concat([df1, df2]) 
+ tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) + + dfa = df1.append(df2) + tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index c3e1f3177b3d3..90172abefb8c4 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -1,38 +1,18 @@ from collections import abc, deque from decimal import Decimal -from io import StringIO from warnings import catch_warnings import numpy as np -from numpy.random import randn import pytest -from pandas.core.dtypes.dtypes import CategoricalDtype - import pandas as pd -from pandas import ( - Categorical, - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - concat, - date_range, - read_csv, -) +from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range import pandas._testing as tm from pandas.core.arrays import SparseArray from pandas.core.construction import create_series_with_explicit_dtype from pandas.tests.extension.decimal import to_decimal -@pytest.fixture(params=[True, False]) -def sort(request): - """Boolean sort keyword for concat and DataFrame.append.""" - return request.param - - class TestConcatenate: def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) @@ -116,39 +96,6 @@ def test_concat_keys_specific_levels(self): assert result.columns.names == ["group_key", None] - def test_concat_dataframe_keys_bug(self, sort): - t1 = DataFrame( - {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} - ) - t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) - - # it works - result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) - assert list(result.columns) == [("t1", "value"), ("t2", "value")] - - def test_concat_series_partial_columns_names(self): - # GH10698 - foo = Series([1, 2], name="foo") - bar = Series([1, 2]) - baz = Series([4, 
5]) - - result = concat([foo, bar, baz], axis=1) - expected = DataFrame( - {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] - ) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) - expected = DataFrame( - {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, - columns=["red", "blue", "yellow"], - ) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, ignore_index=True) - expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("mapping", ["mapping", "dict"]) def test_concat_mapping(self, mapping, non_dict_mapping_subclass): constructor = dict if mapping == "dict" else non_dict_mapping_subclass @@ -176,106 +123,6 @@ def test_concat_mapping(self, mapping, non_dict_mapping_subclass): expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) - def test_concat_ignore_index(self, sort): - frame1 = DataFrame( - {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} - ) - frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) - frame1.index = Index(["x", "y", "z"]) - frame2.index = Index(["x", "y", "q"]) - - v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) - - nan = np.nan - expected = DataFrame( - [ - [nan, nan, nan, 4.3], - ["a", 1, 4.5, 5.2], - ["b", 2, 3.2, 2.2], - ["c", 3, 1.2, nan], - ], - index=Index(["q", "x", "y", "z"]), - ) - if not sort: - expected = expected.loc[["x", "y", "z", "q"]] - - tm.assert_frame_equal(v1, expected) - - @pytest.mark.parametrize( - "name_in1,name_in2,name_in3,name_out", - [ - ("idx", "idx", "idx", "idx"), - ("idx", "idx", None, None), - ("idx", None, None, None), - ("idx1", "idx2", None, None), - ("idx1", "idx1", "idx2", None), - ("idx1", "idx2", "idx3", None), - (None, None, None, None), - ], - ) - def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out): - # GH13475 - 
indices = [ - Index(["a", "b", "c"], name=name_in1), - Index(["b", "c", "d"], name=name_in2), - Index(["c", "d", "e"], name=name_in3), - ] - frames = [ - DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"]) - ] - result = pd.concat(frames, axis=1) - - exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out) - expected = DataFrame( - { - "x": [0, 1, 2, np.nan, np.nan], - "y": [np.nan, 0, 1, 2, np.nan], - "z": [np.nan, np.nan, 0, 1, 2], - }, - index=exp_ind, - ) - - tm.assert_frame_equal(result, expected) - - def test_concat_multiindex_with_keys(self): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - frame = DataFrame( - np.random.randn(10, 3), - index=index, - columns=Index(["A", "B", "C"], name="exp"), - ) - result = concat([frame, frame], keys=[0, 1], names=["iteration"]) - - assert result.index.names == ("iteration",) + index.names - tm.assert_frame_equal(result.loc[0], frame) - tm.assert_frame_equal(result.loc[1], frame) - assert result.index.nlevels == 3 - - def test_concat_multiindex_with_none_in_index_names(self): - # GH 15787 - index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) - df = DataFrame({"col": range(5)}, index=index, dtype=np.int32) - - result = concat([df, df], keys=[1, 2], names=["level2"]) - index = pd.MultiIndex.from_product( - [[1, 2], [1], range(5)], names=["level2", "level1", None] - ) - expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32) - tm.assert_frame_equal(result, expected) - - result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) - level2 = [1] * 5 + [2] * 2 - level1 = [1] * 7 - no_name = list(range(5)) + list(range(2)) - tuples = list(zip(level2, level1, no_name)) - index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) - expected = DataFrame({"col": no_name}, index=index, 
dtype=np.int32) - tm.assert_frame_equal(result, expected) - def test_concat_keys_and_levels(self): df = DataFrame(np.random.randn(1, 3)) df2 = DataFrame(np.random.randn(1, 4)) @@ -330,28 +177,6 @@ def test_concat_keys_levels_no_overlap(self): with pytest.raises(ValueError, match=msg): concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) - def test_concat_rename_index(self): - a = DataFrame( - np.random.rand(3, 3), - columns=list("ABC"), - index=Index(list("abc"), name="index_a"), - ) - b = DataFrame( - np.random.rand(3, 3), - columns=list("ABC"), - index=Index(list("abc"), name="index_b"), - ) - - result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) - - exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) - names = list(exp.index.names) - names[1] = "lvl1" - exp.index.set_names(names, inplace=True) - - tm.assert_frame_equal(result, exp) - assert result.index.names == exp.index.names - def test_crossed_dtypes_weird_corner(self): columns = ["A", "B", "C", "D"] df1 = DataFrame( @@ -385,53 +210,6 @@ def test_crossed_dtypes_weird_corner(self): result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) assert result.index.names == ("first", "second") - def test_dups_index(self): - # GH 4771 - - # single dtypes - df = DataFrame( - np.random.randint(0, 10, size=40).reshape(10, 4), - columns=["A", "A", "C", "C"], - ) - - result = concat([df, df], axis=1) - tm.assert_frame_equal(result.iloc[:, :4], df) - tm.assert_frame_equal(result.iloc[:, 4:], df) - - result = concat([df, df], axis=0) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) - - # multi dtypes - df = concat( - [ - DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), - DataFrame( - np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] - ), - ], - axis=1, - ) - - result = concat([df, df], axis=1) - tm.assert_frame_equal(result.iloc[:, :6], df) - tm.assert_frame_equal(result.iloc[:, 6:], df) 
- - result = concat([df, df], axis=0) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) - - # append - result = df.iloc[0:8, :].append(df.iloc[8:]) - tm.assert_frame_equal(result, df) - - result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) - tm.assert_frame_equal(result, df) - - expected = concat([df, df], axis=0) - result = df.append(df) - tm.assert_frame_equal(result, expected) - def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly @@ -441,38 +219,6 @@ def test_with_mixed_tuples(self, sort): # it works concat([df1, df2], sort=sort) - def test_handle_empty_objects(self, sort): - df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) - - baz = df[:5].copy() - baz["foo"] = "bar" - empty = df[5:5] - - frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0, sort=sort) - - expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) - expected["foo"] = expected["foo"].astype("O") - expected.loc[0:4, "foo"] = "bar" - - tm.assert_frame_equal(concatted, expected) - - # empty as first element with time series - # GH3259 - df = DataFrame( - dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") - ) - empty = DataFrame() - result = concat([df, empty], axis=1) - tm.assert_frame_equal(result, df) - result = concat([empty, df], axis=1) - tm.assert_frame_equal(result, df) - - result = concat([df, empty]) - tm.assert_frame_equal(result, df) - result = concat([empty, df]) - tm.assert_frame_equal(result, df) - def test_concat_mixed_objs(self): # concat mixed series/frames @@ -542,20 +288,6 @@ def test_concat_mixed_objs(self): result = concat([s1, df, s2], ignore_index=True) tm.assert_frame_equal(result, expected) - def test_empty_dtype_coerce(self): - - # xref to #12411 - # xref to #12045 - # xref to #11594 - # see below - - # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) - df2 = DataFrame(data=[[3, None], [4, 
None]], columns=["a", "b"]) - result = concat([df1, df2]) - expected = df1.dtypes - tm.assert_series_equal(result.dtypes, expected) - def test_dtype_coerceion(self): # 12411 @@ -578,76 +310,6 @@ def test_dtype_coerceion(self): result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) - def test_concat_series(self): - - ts = tm.makeTimeSeries() - ts.name = "foo" - - pieces = [ts[:5], ts[5:15], ts[15:]] - - result = concat(pieces) - tm.assert_series_equal(result, ts) - assert result.name == ts.name - - result = concat(pieces, keys=[0, 1, 2]) - expected = ts.copy() - - ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) - - exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) - expected.index = exp_index - tm.assert_series_equal(result, expected) - - def test_concat_series_axis1(self, sort=sort): - ts = tm.makeTimeSeries() - - pieces = [ts[:-2], ts[2:], ts[2:-2]] - - result = concat(pieces, axis=1) - expected = DataFrame(pieces).T - tm.assert_frame_equal(result, expected) - - result = concat(pieces, keys=["A", "B", "C"], axis=1) - expected = DataFrame(pieces, index=["A", "B", "C"]).T - tm.assert_frame_equal(result, expected) - - # preserve series names, #2489 - s = Series(randn(5), name="A") - s2 = Series(randn(5), name="B") - - result = concat([s, s2], axis=1) - expected = DataFrame({"A": s, "B": s2}) - tm.assert_frame_equal(result, expected) - - s2.name = None - result = concat([s, s2], axis=1) - tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) - - # must reindex, #2603 - s = Series(randn(3), index=["c", "a", "b"], name="A") - s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") - result = concat([s, s2], axis=1, sort=sort) - expected = DataFrame({"A": s, "B": s2}) - tm.assert_frame_equal(result, expected) - - def test_concat_series_axis1_names_applied(self): - # ensure names argument is 
not ignored on axis=1, #23490 - s = Series([1, 2, 3]) - s2 = Series([4, 5, 6]) - result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"]) - expected = DataFrame( - [[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A") - ) - tm.assert_frame_equal(result, expected) - - result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) - expected = DataFrame( - [[1, 4], [2, 5], [3, 6]], - columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), - ) - tm.assert_frame_equal(result, expected) - def test_concat_single_with_key(self): df = DataFrame(np.random.randn(10, 4)) @@ -664,17 +326,6 @@ def test_concat_exclude_none(self): with pytest.raises(ValueError, match="All objects passed were None"): concat([None, None]) - def test_concat_timedelta64_block(self): - from pandas import to_timedelta - - rng = to_timedelta(np.arange(10), unit="s") - - df = DataFrame({"time": rng}) - - result = concat([df, df]) - assert (result.iloc[:10]["time"] == rng).all() - assert (result.iloc[10:]["time"] == rng).all() - def test_concat_keys_with_none(self): # #1649 df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) @@ -736,26 +387,6 @@ def test_concat_bug_3602(self): result = concat([df1, df2], axis=1) tm.assert_frame_equal(result, expected) - def test_concat_inner_join_empty(self): - # GH 15328 - df_empty = DataFrame() - df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") - df_expected = DataFrame({"a": []}, index=[], dtype="int64") - - for how, expected in [("inner", df_expected), ("outer", df_a)]: - result = pd.concat([df_a, df_empty], axis=1, join=how) - tm.assert_frame_equal(result, expected) - - def test_concat_series_axis1_same_names_ignore_index(self): - dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] - s1 = Series(randn(len(dates)), index=dates, name="value") - s2 = Series(randn(len(dates)), index=dates, name="value") - - result = concat([s1, s2], axis=1, ignore_index=True) - expected = Index([0, 1]) - - 
tm.assert_index_equal(result.columns, expected) - def test_concat_iterables(self): # GH8645 check concat works with tuples, list, generators, and weird # stuff like deque and custom iterables @@ -788,342 +419,6 @@ def __iter__(self): tm.assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) - def test_concat_invalid(self): - - # trying to concat a ndframe with a non-ndframe - df1 = tm.makeCustomDataframe(10, 2) - for obj in [1, dict(), [1, 2], (1, 2)]: - - msg = ( - f"cannot concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - concat([df1, obj]) - - def test_concat_invalid_first_argument(self): - df1 = tm.makeCustomDataframe(10, 2) - df2 = tm.makeCustomDataframe(10, 2) - msg = ( - "first argument must be an iterable of pandas " - 'objects, you passed an object of type "DataFrame"' - ) - with pytest.raises(TypeError, match=msg): - concat(df1, df2) - - # generator ok though - concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) - - # text reader ok - # GH6583 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - reader = read_csv(StringIO(data), chunksize=1) - result = concat(reader, ignore_index=True) - expected = read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - def test_concat_empty_series(self): - # GH 11082 - s1 = Series([1, 2, 3], name="x") - s2 = Series(name="y", dtype="float64") - res = pd.concat([s1, s2], axis=1) - exp = DataFrame( - {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, - index=Index([0, 1, 2], dtype="O"), - ) - tm.assert_frame_equal(res, exp) - - s1 = Series([1, 2, 3], name="x") - s2 = Series(name="y", dtype="float64") - res = pd.concat([s1, s2], axis=0) - # name will be reset - exp = Series([1, 2, 3]) - tm.assert_series_equal(res, exp) - - # empty Series with no name - s1 = Series([1, 2, 3], name="x") - s2 = Series(name=None, 
dtype="float64") - res = pd.concat([s1, s2], axis=1) - exp = DataFrame( - {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=["x", 0], - index=Index([0, 1, 2], dtype="O"), - ) - tm.assert_frame_equal(res, exp) - - @pytest.mark.parametrize("tz", [None, "UTC"]) - @pytest.mark.parametrize("values", [[], [1, 2, 3]]) - def test_concat_empty_series_timelike(self, tz, values): - # GH 18447 - - first = Series([], dtype="M8[ns]").dt.tz_localize(tz) - dtype = None if values else np.float64 - second = Series(values, dtype=dtype) - - expected = DataFrame( - { - 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), - 1: values, - } - ) - result = concat([first, second], axis=1) - tm.assert_frame_equal(result, expected) - - def test_default_index(self): - # is_series and ignore_index - s1 = Series([1, 2, 3], name="x") - s2 = Series([4, 5, 6], name="y") - res = pd.concat([s1, s2], axis=1, ignore_index=True) - assert isinstance(res.columns, pd.RangeIndex) - exp = DataFrame([[1, 4], [2, 5], [3, 6]]) - # use check_index_type=True to check the result have - # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - # is_series and all inputs have no names - s1 = Series([1, 2, 3]) - s2 = Series([4, 5, 6]) - res = pd.concat([s1, s2], axis=1, ignore_index=False) - assert isinstance(res.columns, pd.RangeIndex) - exp = DataFrame([[1, 4], [2, 5], [3, 6]]) - exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - # is_dataframe and ignore_index - df1 = DataFrame({"A": [1, 2], "B": [5, 6]}) - df2 = DataFrame({"A": [3, 4], "B": [7, 8]}) - - res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - res = pd.concat([df1, df2], axis=1, ignore_index=True) - exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - 
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - def test_concat_multiindex_rangeindex(self): - # GH13542 - # when multi-index levels are RangeIndex objects - # there is a bug in concat with objects of len 1 - - df = DataFrame(np.random.randn(9, 2)) - df.index = MultiIndex( - levels=[pd.RangeIndex(3), pd.RangeIndex(3)], - codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], - ) - - res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) - exp = df.iloc[[2, 3, 4, 5], :] - tm.assert_frame_equal(res, exp) - - def test_concat_multiindex_dfs_with_deepcopy(self): - # GH 9967 - from copy import deepcopy - - example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) - example_dataframe1 = DataFrame([0], index=example_multiindex1) - - example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) - example_dataframe2 = DataFrame([1], index=example_multiindex2) - - example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} - expected_index = pd.MultiIndex( - levels=[["s1", "s2"], ["a"], ["b", "c"]], - codes=[[0, 1], [0, 0], [0, 1]], - names=["testname", None, None], - ) - expected = DataFrame([[0], [1]], index=expected_index) - result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) - tm.assert_frame_equal(result_copy, expected) - result_no_copy = pd.concat(example_dict, names=["testname"]) - tm.assert_frame_equal(result_no_copy, expected) - - def test_categorical_concat_append(self): - cat = Categorical(["a", "b"], categories=["a", "b"]) - vals = [1, 2] - df = DataFrame({"cats": cat, "vals": vals}) - cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) - vals2 = [1, 2, 1, 2] - exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) - - tm.assert_frame_equal(pd.concat([df, df]), exp) - tm.assert_frame_equal(df.append(df), exp) - - # GH 13524 can concat different categories - cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) - vals3 = [1, 2] - df_different_categories = 
DataFrame({"cats": cat3, "vals": vals3}) - - res = pd.concat([df, df_different_categories], ignore_index=True) - exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) - tm.assert_frame_equal(res, exp) - - res = df.append(df_different_categories, ignore_index=True) - tm.assert_frame_equal(res, exp) - - def test_categorical_concat_dtypes(self): - - # GH8143 - index = ["cat", "obj", "num"] - cat = Categorical(["a", "b", "c"]) - obj = Series(["a", "b", "c"]) - num = Series([1, 2, 3]) - df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - - result = df.dtypes == "object" - expected = Series([False, True, False], index=index) - tm.assert_series_equal(result, expected) - - result = df.dtypes == "int64" - expected = Series([False, False, True], index=index) - tm.assert_series_equal(result, expected) - - result = df.dtypes == "category" - expected = Series([True, False, False], index=index) - tm.assert_series_equal(result, expected) - - def test_categorical_concat(self, sort): - # See GH 10177 - df1 = DataFrame( - np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] - ) - - df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) - - cat_values = ["one", "one", "two", "one", "two", "two", "one"] - df2["h"] = Series(Categorical(cat_values)) - - res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) - exp = DataFrame( - { - "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - "b": [ - 1, - 4, - 7, - 10, - 13, - 16, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ], - "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], - "h": [None] * 6 + cat_values, - } - ) - tm.assert_frame_equal(res, exp) - - def test_categorical_concat_gh7864(self): - # GH 7864 - # make sure ordering is preserved - df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) - df["grade"] = Categorical(df["raw_grade"]) - df["grade"].cat.set_categories(["e", "a", "b"]) - - df1 = df[0:3] - df2 = df[3:] - - 
tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) - tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) - - dfx = pd.concat([df1, df2]) - tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) - - dfa = df1.append(df2) - tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) - - def test_categorical_concat_preserve(self): - - # GH 8641 series concat not preserving category dtype - # GH 13524 can concat different categories - s = Series(list("abc"), dtype="category") - s2 = Series(list("abd"), dtype="category") - - exp = Series(list("abcabd")) - res = pd.concat([s, s2], ignore_index=True) - tm.assert_series_equal(res, exp) - - exp = Series(list("abcabc"), dtype="category") - res = pd.concat([s, s], ignore_index=True) - tm.assert_series_equal(res, exp) - - exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") - res = pd.concat([s, s]) - tm.assert_series_equal(res, exp) - - a = Series(np.arange(6, dtype="int64")) - b = Series(list("aabbca")) - - df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) - res = pd.concat([df2, df2]) - exp = DataFrame( - { - "A": pd.concat([a, a]), - "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), - } - ) - tm.assert_frame_equal(res, exp) - - def test_categorical_index_preserver(self): - - a = Series(np.arange(6, dtype="int64")) - b = Series(list("aabbca")) - - df2 = DataFrame( - {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} - ).set_index("B") - result = pd.concat([df2, df2]) - expected = DataFrame( - { - "A": pd.concat([a, a]), - "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), - } - ).set_index("B") - tm.assert_frame_equal(result, expected) - - # wrong categories - df3 = DataFrame( - {"A": a, "B": Categorical(b, categories=list("abe"))} - ).set_index("B") - msg = "categories must match existing categories when appending" - with pytest.raises(TypeError, match=msg): 
- pd.concat([df2, df3]) - - def test_concat_categoricalindex(self): - # GH 16111, categories that aren't lexsorted - categories = [9, 0, 1, 2, 3] - - a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) - b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) - c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) - - result = pd.concat([a, b, c], axis=1) - - exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) - exp = DataFrame( - { - 0: [1, 1, np.nan, np.nan], - 1: [np.nan, 2, 2, np.nan], - 2: [np.nan, np.nan, 3, 3], - }, - columns=[0, 1, 2], - index=exp_idx, - ) - tm.assert_frame_equal(result, exp) - def test_concat_order(self): # GH 17344 dfs = [DataFrame(index=range(3), columns=["a", 1, None])] @@ -1141,7 +436,7 @@ def test_concat_different_extension_dtypes_upcasts(self): expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object) tm.assert_series_equal(result, expected) - def test_concat_odered_dict(self): + def test_concat_ordered_dict(self): # GH 21510 expected = pd.concat( [Series(range(3)), Series(range(4))], keys=["First", "Another"] @@ -1151,22 +446,6 @@ def test_concat_odered_dict(self): ) tm.assert_series_equal(result, expected) - def test_concat_empty_dataframe_dtypes(self): - df = DataFrame(columns=list("abc")) - df["a"] = df["a"].astype(np.bool_) - df["b"] = df["b"].astype(np.int32) - df["c"] = df["c"].astype(np.float64) - - result = pd.concat([df, df]) - assert result["a"].dtype == np.bool_ - assert result["b"].dtype == np.int32 - assert result["c"].dtype == np.float64 - - result = pd.concat([df, df.astype(np.float64)]) - assert result["a"].dtype == np.object_ - assert result["b"].dtype == np.float64 - assert result["c"].dtype == np.float64 - @pytest.mark.parametrize("pdt", [Series, pd.DataFrame]) @pytest.mark.parametrize("dt", np.sctypes["float"]) @@ -1206,144 +485,6 @@ def test_concat_empty_and_non_empty_frame_regression(): tm.assert_frame_equal(result, expected) -def 
test_concat_empty_and_non_empty_series_regression(): - # GH 18187 regression test - s1 = Series([1]) - s2 = Series([], dtype=object) - - expected = s1 - result = pd.concat([s1, s2]) - tm.assert_series_equal(result, expected) - - -def test_concat_sorts_columns(sort): - # GH-4588 - df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) - df2 = DataFrame({"a": [3, 4], "c": [5, 6]}) - - # for sort=True/None - expected = DataFrame( - {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, - columns=["a", "b", "c"], - ) - - if sort is False: - expected = expected[["b", "a", "c"]] - - # default - with tm.assert_produces_warning(None): - result = pd.concat([df1, df2], ignore_index=True, sort=sort) - tm.assert_frame_equal(result, expected) - - -def test_concat_sorts_index(sort): - df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) - df2 = DataFrame({"b": [1, 2]}, index=["a", "b"]) - - # For True/None - expected = DataFrame( - {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] - ) - if sort is False: - expected = expected.loc[["c", "a", "b"]] - - # Warn and sort by default - with tm.assert_produces_warning(None): - result = pd.concat([df1, df2], axis=1, sort=sort) - tm.assert_frame_equal(result, expected) - - -def test_concat_inner_sort(sort): - # https://github.com/pandas-dev/pandas/pull/20613 - df1 = DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) - df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) - - with tm.assert_produces_warning(None): - # unset sort should *not* warn for inner join - # since that never sorted - result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True) - - expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) - if sort is True: - expected = expected[["a", "b"]] - tm.assert_frame_equal(result, expected) - - -def test_concat_aligned_sort(): - # GH-4588 - df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, 
columns=["c", "b", "a"]) - result = pd.concat([df, df], sort=True, ignore_index=True) - expected = DataFrame( - {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, - columns=["a", "b", "c"], - ) - tm.assert_frame_equal(result, expected) - - result = pd.concat([df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True) - expected = expected[["b", "c"]] - tm.assert_frame_equal(result, expected) - - -def test_concat_aligned_sort_does_not_raise(): - # GH-4588 - # We catch TypeErrors from sorting internally and do not re-raise. - df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) - expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) - result = pd.concat([df, df], ignore_index=True, sort=True) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]) -def test_concat_series_name_npscalar_tuple(s1name, s2name): - # GH21015 - s1 = Series({"a": 1, "b": 2}, name=s1name) - s2 = Series({"c": 5, "d": 6}, name=s2name) - result = pd.concat([s1, s2]) - expected = Series({"a": 1, "b": 2, "c": 5, "d": 6}) - tm.assert_series_equal(result, expected) - - -def test_concat_categorical_tz(): - # GH-23816 - a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) - b = Series(["a", "b"], dtype="category") - result = pd.concat([a, b], ignore_index=True) - expected = Series( - [ - pd.Timestamp("2017-01-01", tz="US/Pacific"), - pd.Timestamp("2017-01-02", tz="US/Pacific"), - "a", - "b", - ] - ) - tm.assert_series_equal(result, expected) - - -def test_concat_categorical_unchanged(): - # GH-12007 - # test fix for when concat on categorical and float - # coerces dtype categorical -> float - df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A")) - ser = Series([0, 1, 2], index=[0, 1, 3], name="B") - result = pd.concat([df, ser], axis=1) - expected = DataFrame( - { - "A": Series(["a", "b", "c", np.nan], dtype="category"), - "B": Series([0, 1, np.nan, 2], 
dtype="float"), - } - ) - tm.assert_equal(result, expected) - - -def test_concat_empty_df_object_dtype(): - # GH 9149 - df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) - df_2 = DataFrame(columns=df_1.columns) - result = pd.concat([df_1, df_2], axis=0) - expected = df_1.astype(object) - tm.assert_frame_equal(result, expected) - - def test_concat_sparse(): # GH 23557 a = Series(SparseArray([0, 1, 2])) @@ -1365,20 +506,6 @@ def test_concat_dense_sparse(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("test_series", [True, False]) -def test_concat_copy_index(test_series, axis): - # GH 29879 - if test_series: - ser = Series([1, 2]) - comb = concat([ser, ser], axis=axis, copy=True) - assert comb.index is not ser.index - else: - df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - comb = concat([df, df], axis=axis, copy=True) - assert comb.index is not df.index - assert comb.columns is not df.columns - - @pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]]) def test_duplicate_keys(keys): # GH 33654 diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 0e302f5e71fb4..295846ee1b264 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, Series +from pandas import DataFrame, Index, Series, concat import pandas._testing as tm @@ -157,3 +157,13 @@ def test_concat_astype_dup_col(self): np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] ).astype("category") tm.assert_frame_equal(result, expected) + + def test_concat_dataframe_keys_bug(self, sort): + t1 = DataFrame( + {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} + ) + t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) + + # it works + result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) + assert 
list(result.columns) == [("t1", "value"), ("t2", "value")] diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 0becb16beee08..8783f539faa65 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -15,16 +15,11 @@ Timestamp, concat, date_range, + to_timedelta, ) import pandas._testing as tm -@pytest.fixture(params=[True, False]) -def sort(request): - """Boolean sort keyword for concat and DataFrame.append.""" - return request.param - - class TestDatetimeConcat: def test_concat_datetime64_block(self): from pandas.core.indexes.datetimes import date_range @@ -518,3 +513,13 @@ def test_concat_period_other_series(self): result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) assert result.dtype == "object" + + +def test_concat_timedelta64_block(): + rng = to_timedelta(np.arange(10), unit="s") + + df = DataFrame({"time": rng}) + + result = concat([df, df]) + tm.assert_frame_equal(result.iloc[:10], df) + tm.assert_frame_equal(result.iloc[10:], df) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py new file mode 100644 index 0000000000000..5c540124de8e6 --- /dev/null +++ b/pandas/tests/reshape/concat/test_empty.py @@ -0,0 +1,251 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, concat, date_range +import pandas._testing as tm + + +class TestEmptyConcat: + def test_handle_empty_objects(self, sort): + df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) + + baz = df[:5].copy() + baz["foo"] = "bar" + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0, sort=sort) + + expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) + expected["foo"] = expected["foo"].astype("O") + expected.loc[0:4, "foo"] = "bar" + + tm.assert_frame_equal(concatted, expected) + + # empty as first element 
with time series + # GH3259 + df = DataFrame( + dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") + ) + empty = DataFrame() + result = concat([df, empty], axis=1) + tm.assert_frame_equal(result, df) + result = concat([empty, df], axis=1) + tm.assert_frame_equal(result, df) + + result = concat([df, empty]) + tm.assert_frame_equal(result, df) + result = concat([empty, df]) + tm.assert_frame_equal(result, df) + + def test_concat_empty_series(self): + # GH 11082 + s1 = Series([1, 2, 3], name="x") + s2 = Series(name="y", dtype="float64") + res = pd.concat([s1, s2], axis=1) + exp = DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, + index=Index([0, 1, 2], dtype="O"), + ) + tm.assert_frame_equal(res, exp) + + s1 = Series([1, 2, 3], name="x") + s2 = Series(name="y", dtype="float64") + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with no name + s1 = Series([1, 2, 3], name="x") + s2 = Series(name=None, dtype="float64") + res = pd.concat([s1, s2], axis=1) + exp = DataFrame( + {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=["x", 0], + index=Index([0, 1, 2], dtype="O"), + ) + tm.assert_frame_equal(res, exp) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.parametrize("values", [[], [1, 2, 3]]) + def test_concat_empty_series_timelike(self, tz, values): + # GH 18447 + + first = Series([], dtype="M8[ns]").dt.tz_localize(tz) + dtype = None if values else np.float64 + second = Series(values, dtype=dtype) + + expected = DataFrame( + { + 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), + 1: values, + } + ) + result = concat([first, second], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "left,right,expected", + [ + # booleans + (np.bool_, np.int32, np.int32), + (np.bool_, np.float32, np.object_), + # datetime-like + ("m8[ns]", np.bool_, np.object_), + ("m8[ns]", np.int64, 
np.object_), + ("M8[ns]", np.bool_, np.object_), + ("M8[ns]", np.int64, np.object_), + # categorical + ("category", "category", "category"), + ("category", "object", "object"), + ], + ) + def test_concat_empty_series_dtypes(self, left, right, expected): + result = pd.concat([Series(dtype=left), Series(dtype=right)]) + assert result.dtype == expected + + @pytest.mark.parametrize( + "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] + ) + def test_concat_empty_series_dtypes_match_roundtrips(self, dtype): + dtype = np.dtype(dtype) + + result = pd.concat([Series(dtype=dtype)]) + assert result.dtype == dtype + + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)]) + assert result.dtype == dtype + + def test_concat_empty_series_dtypes_roundtrips(self): + + # round-tripping with self & like self + names = ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] + + def int_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"i", "u", "b"}) and ( + dtype.kind == "i" or dtype2.kind == "i" + ): + return "i" + elif not len(typs - {"u", "b"}) and ( + dtype.kind == "u" or dtype2.kind == "u" + ): + return "u" + return None + + def float_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"f", "i", "u"}) and ( + dtype.kind == "f" or dtype2.kind == "f" + ): + return "f" + return None + + def get_result_type(dtype, dtype2): + result = float_result_type(dtype, dtype2) + if result is not None: + return result + result = int_result_type(dtype, dtype2) + if result is not None: + return result + return "O" + + for dtype in map(np.dtype, names): + for dtype2 in map(np.dtype, names): + if dtype == dtype2: + continue + + expected = get_result_type(dtype, dtype2) + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype + assert result.kind == expected + + def test_concat_empty_series_dtypes_triple(self): + + assert ( + pd.concat( + [Series(dtype="M8[ns]"), Series(dtype=np.bool_),
Series(dtype=np.int64)] + ).dtype + == np.object_ + ) + + def test_concat_empty_series_dtype_category_with_array(self): + # GH#18515 + assert ( + pd.concat( + [Series(np.array([]), dtype="category"), Series(dtype="float64")] + ).dtype + == "float64" + ) + + def test_concat_empty_series_dtypes_sparse(self): + result = pd.concat( + [ + Series(dtype="float64").astype("Sparse"), + Series(dtype="float64").astype("Sparse"), + ] + ) + assert result.dtype == "Sparse[float64]" + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] + ) + # TODO: release-note: concat sparse dtype + expected = pd.SparseDtype(np.float64) + assert result.dtype == expected + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] + ) + # TODO: release-note: concat sparse dtype + expected = pd.SparseDtype("object") + assert result.dtype == expected + + def test_concat_empty_df_object_dtype(self): + # GH 9149 + df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) + df_2 = DataFrame(columns=df_1.columns) + result = pd.concat([df_1, df_2], axis=0) + expected = df_1.astype(object) + tm.assert_frame_equal(result, expected) + + def test_concat_empty_dataframe_dtypes(self): + df = DataFrame(columns=list("abc")) + df["a"] = df["a"].astype(np.bool_) + df["b"] = df["b"].astype(np.int32) + df["c"] = df["c"].astype(np.float64) + + result = pd.concat([df, df]) + assert result["a"].dtype == np.bool_ + assert result["b"].dtype == np.int32 + assert result["c"].dtype == np.float64 + + result = pd.concat([df, df.astype(np.float64)]) + assert result["a"].dtype == np.object_ + assert result["b"].dtype == np.float64 + assert result["c"].dtype == np.float64 + + def test_concat_inner_join_empty(self): + # GH 15328 + df_empty = DataFrame() + df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") + df_expected = DataFrame({"a": []}, index=[], dtype="int64") + + for how, expected in [("inner", df_expected), ("outer", 
df_a)]: + result = pd.concat([df_a, df_empty], axis=1, join=how) + tm.assert_frame_equal(result, expected) + + def test_empty_dtype_coerce(self): + + # xref to #12411 + # xref to #12045 + # xref to #11594 + # see below + + # 10571 + df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) + df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"]) + result = concat([df1, df2]) + expected = df1.dtypes + tm.assert_series_equal(result.dtypes, expected) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py new file mode 100644 index 0000000000000..e283212b4e60c --- /dev/null +++ b/pandas/tests/reshape/concat/test_index.py @@ -0,0 +1,261 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm + + +class TestIndexConcat: + def test_concat_ignore_index(self, sort): + frame1 = DataFrame( + {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} + ) + frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) + frame1.index = Index(["x", "y", "z"]) + frame2.index = Index(["x", "y", "q"]) + + v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) + + nan = np.nan + expected = DataFrame( + [ + [nan, nan, nan, 4.3], + ["a", 1, 4.5, 5.2], + ["b", 2, 3.2, 2.2], + ["c", 3, 1.2, nan], + ], + index=Index(["q", "x", "y", "z"]), + ) + if not sort: + expected = expected.loc[["x", "y", "z", "q"]] + + tm.assert_frame_equal(v1, expected) + + @pytest.mark.parametrize( + "name_in1,name_in2,name_in3,name_out", + [ + ("idx", "idx", "idx", "idx"), + ("idx", "idx", None, None), + ("idx", None, None, None), + ("idx1", "idx2", None, None), + ("idx1", "idx1", "idx2", None), + ("idx1", "idx2", "idx3", None), + (None, None, None, None), + ], + ) + def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out): + # GH13475 + indices = [ + Index(["a", "b", "c"], name=name_in1), + Index(["b", "c", "d"], 
name=name_in2), + Index(["c", "d", "e"], name=name_in3), + ] + frames = [ + DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"]) + ] + result = pd.concat(frames, axis=1) + + exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out) + expected = DataFrame( + { + "x": [0, 1, 2, np.nan, np.nan], + "y": [np.nan, 0, 1, 2, np.nan], + "z": [np.nan, np.nan, 0, 1, 2], + }, + index=exp_ind, + ) + + tm.assert_frame_equal(result, expected) + + def test_concat_rename_index(self): + a = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_a"), + ) + b = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_b"), + ) + + result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) + + exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) + names = list(exp.index.names) + names[1] = "lvl1" + exp.index.set_names(names, inplace=True) + + tm.assert_frame_equal(result, exp) + assert result.index.names == exp.index.names + + @pytest.mark.parametrize("test_series", [True, False]) + def test_concat_copy_index(self, test_series, axis): + # GH 29879 + if test_series: + ser = Series([1, 2]) + comb = concat([ser, ser], axis=axis, copy=True) + assert comb.index is not ser.index + else: + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + comb = concat([df, df], axis=axis, copy=True) + assert comb.index is not df.index + assert comb.columns is not df.columns + + def test_default_index(self): + # is_series and ignore_index + s1 = Series([1, 2, 3], name="x") + s2 = Series([4, 5, 6], name="y") + res = pd.concat([s1, s2], axis=1, ignore_index=True) + assert isinstance(res.columns, pd.RangeIndex) + exp = DataFrame([[1, 4], [2, 5], [3, 6]]) + # use check_index_type=True to check the result have + # RangeIndex (default index) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + # is_series and all inputs have no names + s1 = Series([1, 2, 
3]) + s2 = Series([4, 5, 6]) + res = pd.concat([s1, s2], axis=1, ignore_index=False) + assert isinstance(res.columns, pd.RangeIndex) + exp = DataFrame([[1, 4], [2, 5], [3, 6]]) + exp.columns = pd.RangeIndex(2) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + # is_dataframe and ignore_index + df1 = DataFrame({"A": [1, 2], "B": [5, 6]}) + df2 = DataFrame({"A": [3, 4], "B": [7, 8]}) + + res = pd.concat([df1, df2], axis=0, ignore_index=True) + exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + res = pd.concat([df1, df2], axis=1, ignore_index=True) + exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + def test_dups_index(self): + # GH 4771 + + # single dtypes + df = DataFrame( + np.random.randint(0, 10, size=40).reshape(10, 4), + columns=["A", "A", "C", "C"], + ) + + result = concat([df, df], axis=1) + tm.assert_frame_equal(result.iloc[:, :4], df) + tm.assert_frame_equal(result.iloc[:, 4:], df) + + result = concat([df, df], axis=0) + tm.assert_frame_equal(result.iloc[:10], df) + tm.assert_frame_equal(result.iloc[10:], df) + + # multi dtypes + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + + result = concat([df, df], axis=1) + tm.assert_frame_equal(result.iloc[:, :6], df) + tm.assert_frame_equal(result.iloc[:, 6:], df) + + result = concat([df, df], axis=0) + tm.assert_frame_equal(result.iloc[:10], df) + tm.assert_frame_equal(result.iloc[10:], df) + + # append + result = df.iloc[0:8, :].append(df.iloc[8:]) + tm.assert_frame_equal(result, df) + + result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) + tm.assert_frame_equal(result, df) + + expected = concat([df, df], axis=0) + result = df.append(df) + 
tm.assert_frame_equal(result, expected) + + +class TestMultiIndexConcat: + def test_concat_multiindex_with_keys(self): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + result = concat([frame, frame], keys=[0, 1], names=["iteration"]) + + assert result.index.names == ("iteration",) + index.names + tm.assert_frame_equal(result.loc[0], frame) + tm.assert_frame_equal(result.loc[1], frame) + assert result.index.nlevels == 3 + + def test_concat_multiindex_with_none_in_index_names(self): + # GH 15787 + index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) + df = DataFrame({"col": range(5)}, index=index, dtype=np.int32) + + result = concat([df, df], keys=[1, 2], names=["level2"]) + index = pd.MultiIndex.from_product( + [[1, 2], [1], range(5)], names=["level2", "level1", None] + ) + expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32) + tm.assert_frame_equal(result, expected) + + result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) + level2 = [1] * 5 + [2] * 2 + level1 = [1] * 7 + no_name = list(range(5)) + list(range(2)) + tuples = list(zip(level2, level1, no_name)) + index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) + expected = DataFrame({"col": no_name}, index=index, dtype=np.int32) + tm.assert_frame_equal(result, expected) + + def test_concat_multiindex_rangeindex(self): + # GH13542 + # when multi-index levels are RangeIndex objects + # there is a bug in concat with objects of len 1 + + df = DataFrame(np.random.randn(9, 2)) + df.index = MultiIndex( + levels=[pd.RangeIndex(3), pd.RangeIndex(3)], + codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], + ) + + res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) + exp = 
df.iloc[[2, 3, 4, 5], :] + tm.assert_frame_equal(res, exp) + + def test_concat_multiindex_dfs_with_deepcopy(self): + # GH 9967 + from copy import deepcopy + + example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) + example_dataframe1 = DataFrame([0], index=example_multiindex1) + + example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) + example_dataframe2 = DataFrame([1], index=example_multiindex2) + + example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} + expected_index = pd.MultiIndex( + levels=[["s1", "s2"], ["a"], ["b", "c"]], + codes=[[0, 1], [0, 0], [0, 1]], + names=["testname", None, None], + ) + expected = DataFrame([[0], [1]], index=expected_index) + result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) + tm.assert_frame_equal(result_copy, expected) + result_no_copy = pd.concat(example_dict, names=["testname"]) + tm.assert_frame_equal(result_no_copy, expected) diff --git a/pandas/tests/reshape/concat/test_invalid.py b/pandas/tests/reshape/concat/test_invalid.py new file mode 100644 index 0000000000000..3a886e0d612c6 --- /dev/null +++ b/pandas/tests/reshape/concat/test_invalid.py @@ -0,0 +1,51 @@ +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, concat, read_csv +import pandas._testing as tm + + +class TestInvalidConcat: + def test_concat_invalid(self): + + # trying to concat a ndframe with a non-ndframe + df1 = tm.makeCustomDataframe(10, 2) + for obj in [1, dict(), [1, 2], (1, 2)]: + + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + concat([df1, obj]) + + def test_concat_invalid_first_argument(self): + df1 = tm.makeCustomDataframe(10, 2) + df2 = tm.makeCustomDataframe(10, 2) + msg = ( + "first argument must be an iterable of pandas " + 'objects, you passed an object of type "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + concat(df1, df2) + 
+ # generator ok though + concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) + + # text reader ok + # GH6583 + data = """index,A,B,C,D + foo,2,3,4,5 + bar,7,8,9,10 + baz,12,13,14,15 + qux,12,13,14,15 + foo2,12,13,14,15 + bar2,12,13,14,15 + """ + + reader = read_csv(StringIO(data), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index 7f84e937736ac..aea2840bb897f 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -1,123 +1,146 @@ import numpy as np +from numpy.random import randn import pytest import pandas as pd -from pandas import Series +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + concat, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param class TestSeriesConcat: - @pytest.mark.parametrize( - "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] - ) - def test_concat_empty_series_dtypes_match_roundtrips(self, dtype): - dtype = np.dtype(dtype) - - result = pd.concat([Series(dtype=dtype)]) - assert result.dtype == dtype - - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)]) - assert result.dtype == dtype - - def test_concat_empty_series_dtypes_roundtrips(self): - - # round-tripping with self & like self - dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) - - def int_result_type(dtype, dtype2): - typs = {dtype.kind, dtype2.kind} - if not len(typs - {"i", "u", "b"}) and ( - dtype.kind == "i" or dtype2.kind == "i" - ): - return "i" - elif not len(typs - {"u", "b"}) and ( - dtype.kind == "u" or dtype2.kind == "u" - ): - return "u" - return None - - def float_result_type(dtype, 
dtype2): - typs = {dtype.kind, dtype2.kind} - if not len(typs - {"f", "i", "u"}) and ( - dtype.kind == "f" or dtype2.kind == "f" - ): - return "f" - return None - - def get_result_type(dtype, dtype2): - result = float_result_type(dtype, dtype2) - if result is not None: - return result - result = int_result_type(dtype, dtype2) - if result is not None: - return result - return "O" - - for dtype in dtypes: - for dtype2 in dtypes: - if dtype == dtype2: - continue - - expected = get_result_type(dtype, dtype2) - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype - assert result.kind == expected + def test_concat_series(self): - @pytest.mark.parametrize( - "left,right,expected", - [ - # booleans - (np.bool_, np.int32, np.int32), - (np.bool_, np.float32, np.object_), - # datetime-like - ("m8[ns]", np.bool_, np.object_), - ("m8[ns]", np.int64, np.object_), - ("M8[ns]", np.bool_, np.object_), - ("M8[ns]", np.int64, np.object_), - # categorical - ("category", "category", "category"), - ("category", "object", "object"), - ], - ) - def test_concat_empty_series_dtypes(self, left, right, expected): - result = pd.concat([Series(dtype=left), Series(dtype=right)]) - assert result.dtype == expected + ts = tm.makeTimeSeries() + ts.name = "foo" - def test_concat_empty_series_dtypes_triple(self): + pieces = [ts[:5], ts[5:15], ts[15:]] - assert ( - pd.concat( - [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] - ).dtype - == np.object_ - ) + result = concat(pieces) + tm.assert_series_equal(result, ts) + assert result.name == ts.name + + result = concat(pieces, keys=[0, 1, 2]) + expected = ts.copy() + + ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) + + exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) + expected.index = exp_index + tm.assert_series_equal(result, expected) + + def 
test_concat_empty_and_non_empty_series_regression(self): + # GH 18187 regression test + s1 = Series([1]) + s2 = Series([], dtype=object) + + expected = s1 + result = pd.concat([s1, s2]) + tm.assert_series_equal(result, expected) + + def test_concat_series_axis1(self, sort): + ts = tm.makeTimeSeries() - def test_concat_empty_series_dtype_category_with_array(self): - # GH#18515 - assert ( - pd.concat( - [Series(np.array([]), dtype="category"), Series(dtype="float64")] - ).dtype - == "float64" + pieces = [ts[:-2], ts[2:], ts[2:-2]] + + result = concat(pieces, axis=1) + expected = DataFrame(pieces).T + tm.assert_frame_equal(result, expected) + + result = concat(pieces, keys=["A", "B", "C"], axis=1) + expected = DataFrame(pieces, index=["A", "B", "C"]).T + tm.assert_frame_equal(result, expected) + + # preserve series names, #2489 + s = Series(randn(5), name="A") + s2 = Series(randn(5), name="B") + + result = concat([s, s2], axis=1) + expected = DataFrame({"A": s, "B": s2}) + tm.assert_frame_equal(result, expected) + + s2.name = None + result = concat([s, s2], axis=1) + tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) + + # must reindex, #2603 + s = Series(randn(3), index=["c", "a", "b"], name="A") + s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") + result = concat([s, s2], axis=1, sort=sort) + expected = DataFrame({"A": s, "B": s2}, index=list("abcd" if sort else "cabd")) + tm.assert_frame_equal(result, expected) + + def test_concat_series_axis1_names_applied(self): + # ensure names argument is not ignored on axis=1, #23490 + s = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A") ) + tm.assert_frame_equal(result, expected) - def test_concat_empty_series_dtypes_sparse(self): - result = pd.concat( - [ - Series(dtype="float64").astype("Sparse"), - Series(dtype="float64").astype("Sparse"), - ] + result = concat([s, s2], axis=1,
keys=[("a", 1), ("b", 2)], names=["A", "B"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], + columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), ) - assert result.dtype == "Sparse[float64]" + tm.assert_frame_equal(result, expected) - result = pd.concat( - [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] + def test_concat_series_axis1_same_names_ignore_index(self): + dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] + s1 = Series(randn(len(dates)), index=dates, name="value") + s2 = Series(randn(len(dates)), index=dates, name="value") + + result = concat([s1, s2], axis=1, ignore_index=True) + expected = Index([0, 1]) + + tm.assert_index_equal(result.columns, expected) + + @pytest.mark.parametrize( + "s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))] + ) + def test_concat_series_name_npscalar_tuple(self, s1name, s2name): + # GH21015 + s1 = Series({"a": 1, "b": 2}, name=s1name) + s2 = Series({"c": 5, "d": 6}, name=s2name) + result = pd.concat([s1, s2]) + expected = Series({"a": 1, "b": 2, "c": 5, "d": 6}) + tm.assert_series_equal(result, expected) + + def test_concat_series_partial_columns_names(self): + # GH10698 + foo = Series([1, 2], name="foo") + bar = Series([1, 2]) + baz = Series([4, 5]) + + result = concat([foo, bar, baz], axis=1) + expected = DataFrame( + {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] ) - # TODO: release-note: concat sparse dtype - expected = pd.SparseDtype(np.float64) - assert result.dtype == expected + tm.assert_frame_equal(result, expected) - result = pd.concat( - [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] + result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) + expected = DataFrame( + {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, + columns=["red", "blue", "yellow"], ) - # TODO: release-note: concat sparse dtype - expected = pd.SparseDtype("object") - assert result.dtype == expected + 
tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, ignore_index=True) + expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_sort.py b/pandas/tests/reshape/concat/test_sort.py new file mode 100644 index 0000000000000..865f696b7a73a --- /dev/null +++ b/pandas/tests/reshape/concat/test_sort.py @@ -0,0 +1,83 @@ +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestConcatSort: + def test_concat_sorts_columns(self, sort): + # GH-4588 + df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = DataFrame({"a": [3, 4], "c": [5, 6]}) + + # for sort=True/None + expected = DataFrame( + {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, + columns=["a", "b", "c"], + ) + + if sort is False: + expected = expected[["b", "a", "c"]] + + # default + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], ignore_index=True, sort=sort) + tm.assert_frame_equal(result, expected) + + def test_concat_sorts_index(self, sort): + df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) + df2 = DataFrame({"b": [1, 2]}, index=["a", "b"]) + + # For True/None + expected = DataFrame( + {"a": [2, 3, 1], "b": [1, 2, None]}, + index=["a", "b", "c"], + columns=["a", "b"], + ) + if sort is False: + expected = expected.loc[["c", "a", "b"]] + + # Warn and sort by default + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], axis=1, sort=sort) + tm.assert_frame_equal(result, expected) + + def test_concat_inner_sort(self, sort): + # https://github.com/pandas-dev/pandas/pull/20613 + df1 = DataFrame( + {"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"] + ) + df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) + + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, 
df2], sort=sort, join="inner", ignore_index=True) + + expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) + if sort is True: + expected = expected[["a", "b"]] + tm.assert_frame_equal(result, expected) + + def test_concat_aligned_sort(self): + # GH-4588 + df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) + result = pd.concat([df, df], sort=True, ignore_index=True) + expected = DataFrame( + {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + result = pd.concat( + [df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True + ) + expected = expected[["b", "c"]] + tm.assert_frame_equal(result, expected) + + def test_concat_aligned_sort_does_not_raise(self): + # GH-4588 + # We catch TypeErrors from sorting internally and do not re-raise. + df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) + expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) + result = pd.concat([df, df], ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected)