From 016ce1afd9ed38a8616fff7a2dcf7a1eb14cb6ff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 16:52:39 +0100 Subject: [PATCH 1/5] [ArrayManager] TST: get tests running for /tests/frame --- .github/workflows/ci.yml | 1 + pandas/conftest.py | 5 ++- pandas/core/internals/array_manager.py | 8 ++++- pandas/tests/frame/test_arithmetic.py | 6 +++- pandas/tests/frame/test_block_internals.py | 6 ++++ pandas/tests/frame/test_constructors.py | 12 ++++++-- pandas/tests/frame/test_nonunique_indexes.py | 12 ++++++-- pandas/tests/frame/test_repr_info.py | 6 ++-- pandas/tests/frame/test_stack_unstack.py | 32 +++++++++++++++++--- pandas/tests/frame/test_subclass.py | 1 + 10 files changed, 74 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 461363d295f6a..089e19a2909e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,7 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/frame/test_* --array-manager -k "not test_reductions" pytest pandas/tests/arithmetic/ --array-manager pytest pandas/tests/reshape/merge --array-manager diff --git a/pandas/conftest.py b/pandas/conftest.py index ce572e42abec6..2df6325fe0e1a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -407,11 +407,14 @@ def __len__(self): # Indices # ---------------------------------------------------------------- @pytest.fixture -def multiindex_year_month_day_dataframe_random_data(): +def multiindex_year_month_day_dataframe_random_data(using_array_manager): """ DataFrame with 3 level MultiIndex (year, month, day) covering first 100 business days from 2000-01-01 with random data """ + if using_array_manager: + # TODO(ArrayManager) groupby + pytest.skip("Not yet implemented for ArrayManager") tdf = tm.makeTimeDataFrame(100) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d38d278e89a67..0d73acd080f7a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -741,7 +741,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = extract_array(value, extract_numpy=True) if value.ndim == 2: - value = value[0, :] + if value.shape[0] == 1: + value = value[0, :] + else: + raise ValueError( + f"expected 1D array, got array with shape {value.shape}" + ) + # TODO self.arrays can be empty # assert len(value) == len(self.arrays[0]) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 44b6d44ee6275..c6816fa6481f4 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -7,6 +7,8 @@ import pytest import pytz +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -686,6 +688,7 @@ def test_df_add_2d_array_collike_broadcasts(self): result = collike + df tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) decide on dtypes def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): # GH#23000 opname = all_arithmetic_operators @@ -707,6 +710,7 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) decide on dtypes def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators): # GH#23000 opname = all_arithmetic_operators @@ -1351,7 +1355,7 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne) def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() - missing_df.iloc[0]["A"] = np.nan + missing_df.loc[missing_df.index[0], "A"] = np.nan with np.errstate(invalid="ignore"): expected = missing_df.values < 0 with np.errstate(invalid="raise"): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 193f1617fbb55..9644a5f6ae54e 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -9,6 +9,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -30,6 +31,11 @@ # structure +# TODO(ArrayManager) check which of those tests need to be rewritten the test the +# equivalent for ArrayManager +pytestmark = td.skip_array_manager_invalid_test + + class TestDataFrameBlockInternals: def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 altering a datetime64tz column inplace invalidates the diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 14adc8a992609..7296f8ccdeafc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -18,6 +18,7 @@ import pytz from pandas.compat import np_version_under1p19 +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import ( @@ -159,7 +160,10 @@ def test_constructor_cast_failure(self): df["foo"] = np.ones((4, 2)).tolist() # this is not ok - msg = "Wrong number of items passed 2, placement implies 1" + msg = ( + "Wrong number of items passed 2, placement implies 1" + "|expected 1D array, got array" + ) with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) @@ -174,12 +178,15 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view(self): + def test_constructor_dtype_nocast_view_dataframe(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 assert df.values[0, 0] == 99 + @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? + def test_constructor_dtype_nocast_view_2d_array(self): + df = DataFrame([[1, 2]]) should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 @@ -1931,6 +1938,7 @@ def test_constructor_frame_copy(self, float_frame): assert (cop["A"] == 5).all() assert not (float_frame["A"] == 5).all() + @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index c3812e109b938..9b0dedd7c6759 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -18,6 +20,9 @@ def check(result, expected=None): class TestDataFrameNonuniqueIndexes: + + # TODO(ArrayManager) iset with multiple elements not yet implemented + @td.skip_array_manager_not_yet_implemented def test_setattr_columns_vs_construct_with_columns(self): # assignment @@ -330,7 +335,7 @@ def test_multi_dtype2(self): expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]) tm.assert_frame_equal(df, expected) - def test_dups_across_blocks(self): + def test_dups_across_blocks(self, using_array_manager): # dups across blocks df_float = DataFrame(np.random.randn(10, 3), dtype="float64") df_int = DataFrame(np.random.randn(10, 3), dtype="int64") @@ -341,8 +346,9 @@ def test_dups_across_blocks(self): ) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) - assert len(df._mgr.blknos) == len(df.columns) - assert len(df._mgr.blklocs) == len(df.columns) + if not using_array_manager: + assert len(df._mgr.blknos) == len(df.columns) + assert len(df._mgr.blklocs) == len(df.columns) # testing iloc for i in range(len(df.columns)): diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index c8131049b51d2..03c5b6e027dac 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -26,14 +26,16 @@ class TestDataFrameReprInfoEtc: - def test_repr_bytes_61_lines(self): + def test_repr_bytes_61_lines(self, using_array_manager): # GH#12857 lets = list("ACDEFGHIJKLMNOP") slen = 50 nseqs = 1000 words = [[np.random.choice(lets) for x in range(slen)] for _ in range(nseqs)] df = DataFrame(words).astype("U1") - assert (df.dtypes == object).all() + # TODO(Arraymanager) astype("U1") actually gives this dtype instead of object + if not using_array_manager: + assert (df.dtypes == object).all() # smoke tests; at one point this raised with 61 but not 60 repr(df) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 9945b739f8a87..1f6ab02d1ce5f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -60,12 +62,13 @@ def test_stack_mixed_level(self): expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) - def test_unstack_not_consolidated(self): + def test_unstack_not_consolidated(self, using_array_manager): # Gh#34708 df = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) df2 = df[["x"]] df2["y"] = df["y"] - assert len(df2._mgr.blocks) == 2 + if not using_array_manager: + assert len(df2._mgr.blocks) == 2 res = df2.unstack() expected = df.unstack() @@ -118,6 +121,8 @@ def test_unstack_fill(self): expected = unstacked["w"] tm.assert_frame_equal(result, expected) + # TODO(ArrayManager) iset with multiple elements not yet implemented + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) iset def test_unstack_fill_frame(self): # From a dataframe @@ -747,7 +752,8 @@ def test_unstack_multi_level_rows_and_cols(self): expected = df.unstack(["i3"]).unstack(["i2"]) tm.assert_frame_equal(result, expected) - def test_unstack_nan_index(self): # GH7466 + def test_unstack_nan_index1(self): + # GH7466 def cast(val): val_str = "" if val != val else val return f"{val_str:1}" @@ -833,6 +839,7 @@ def verify(df): for col in ["4th", "5th"]: verify(udf[col]) + def test_unstack_nan_index2(self): # GH7403 df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) df.iloc[3, 1] = np.NaN @@ -875,6 +882,7 @@ def verify(df): right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) + def test_unstack_nan_index3(self, using_array_manager): # GH7401 df = DataFrame( { @@ -896,8 +904,13 @@ def verify(df): ) right = DataFrame(vals, columns=cols, index=idx) + if using_array_manager: + # with ArrayManager preserve dtype where possible + cols = right.columns[[1, 2, 3, 5]] + right[cols] = right[cols].astype("int64") tm.assert_frame_equal(left, right) + def test_unstack_nan_index4(self): # GH4862 vals = [ ["Hg", np.nan, np.nan, 680585148], @@ -938,6 +951,8 @@ def verify(df): left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) tm.assert_frame_equal(left.unstack(), right) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) MultiIndex bug + def test_unstack_nan_index5(self): # GH9497 - multiple unstack with nulls df = DataFrame( { @@ -1453,6 +1468,7 @@ def test_stack_mixed_dtype(self, multiindex_dataframe_random_data): assert result.name is None assert stacked["bar"].dtype == np.float_ + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_unstack_bug(self): df = DataFrame( { @@ -1689,6 +1705,7 @@ def test_unstack_period_frame(self): tm.assert_frame_equal(result3, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_stack_multiple_bug(self): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) @@ -1887,7 +1904,7 @@ def test_unstack_group_index_overflow(self): result = s.unstack(4) assert result.shape == (500, 2) - def test_unstack_with_missing_int_cast_to_float(self): + def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): # https://github.com/pandas-dev/pandas/issues/37115 df = DataFrame( { @@ -1899,7 +1916,8 @@ def test_unstack_with_missing_int_cast_to_float(self): # add another int column to get 2 blocks df["is_"] = 1 - assert len(df._mgr.blocks) == 2 + if not using_array_manager: + assert len(df._mgr.blocks) == 2 result = df.unstack("b") result[("is_", "ca")] = result[("is_", "ca")].fillna(0) @@ -1912,6 +1930,10 @@ def test_unstack_with_missing_int_cast_to_float(self): names=[None, "b"], ), ) + if using_array_manager: + # with ArrayManager preserve dtype where possible + expected[("v", "cb")] = expected[("v", "cb")].astype("int64") + expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64") tm.assert_frame_equal(result, expected) def test_unstack_with_level_has_nan(self): diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 784ca03fa9c03..0c8a51b2adc93 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -702,6 +702,7 @@ def test_idxmax_preserves_subclass(self): result = df.idxmax() assert isinstance(result, tm.SubclassedSeries) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) equals def test_equals_subclass(self): # https://github.com/pandas-dev/pandas/pull/34402 # allow subclass in both directions From 62fb0d57a265833c8a5b7eaf28dbec3d89349726 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Feb 2021 11:38:58 +0100 Subject: [PATCH 2/5] remove skips that are no longer needed --- pandas/tests/frame/test_block_internals.py | 2 +- pandas/tests/frame/test_nonunique_indexes.py | 5 ----- pandas/tests/frame/test_stack_unstack.py | 6 +++--- pandas/tests/frame/test_subclass.py | 1 - 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 9644a5f6ae54e..9c9557a442a80 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -31,7 +31,7 @@ # structure -# TODO(ArrayManager) check which of those tests need to be rewritten the test the +# TODO(ArrayManager) check which of those tests need to be rewritten to test the # equivalent for ArrayManager pytestmark = td.skip_array_manager_invalid_test diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 9b0dedd7c6759..be7dd4eb4942e 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -20,9 +18,6 @@ def check(result, expected=None): class TestDataFrameNonuniqueIndexes: - - # TODO(ArrayManager) iset with multiple elements not yet implemented - @td.skip_array_manager_not_yet_implemented def test_setattr_columns_vs_construct_with_columns(self): # assignment diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 1f6ab02d1ce5f..0000ee62716f4 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -122,7 +122,7 @@ def test_unstack_fill(self): tm.assert_frame_equal(result, expected) # TODO(ArrayManager) iset with multiple elements not yet implemented - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) iset + # @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) iset def test_unstack_fill_frame(self): # From a dataframe @@ -905,7 +905,7 @@ def test_unstack_nan_index3(self, using_array_manager): right = DataFrame(vals, columns=cols, index=idx) if using_array_manager: - # with ArrayManager preserve dtype where possible + # INFO(ArrayManager) with ArrayManager preserve dtype where possible cols = right.columns[[1, 2, 3, 5]] right[cols] = right[cols].astype("int64") tm.assert_frame_equal(left, right) @@ -1931,7 +1931,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): ), ) if using_array_manager: - # with ArrayManager preserve dtype where possible + # INFO(ArrayManager) with ArrayManager preserve dtype where possible expected[("v", "cb")] = expected[("v", "cb")].astype("int64") expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 0c8a51b2adc93..784ca03fa9c03 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -702,7 +702,6 @@ def test_idxmax_preserves_subclass(self): result = df.idxmax() assert isinstance(result, tm.SubclassedSeries) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) equals def test_equals_subclass(self): # https://github.com/pandas-dev/pandas/pull/34402 # allow subclass in both directions From 21f59af754f9990c5a754698203cf62f7c6a13a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Feb 2021 17:57:57 +0100 Subject: [PATCH 3/5] clean-up unnecessary skips --- pandas/conftest.py | 5 +---- pandas/tests/frame/test_stack_unstack.py | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index ed9adc9df5173..426cbf6a65aa5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -407,14 +407,11 @@ def __len__(self): # Indices # ---------------------------------------------------------------- @pytest.fixture -def multiindex_year_month_day_dataframe_random_data(using_array_manager): +def multiindex_year_month_day_dataframe_random_data(): """ DataFrame with 3 level MultiIndex (year, month, day) covering first 100 business days from 2000-01-01 with random data """ - if using_array_manager: - # TODO(ArrayManager) groupby - pytest.skip("Not yet implemented for ArrayManager") tdf = tm.makeTimeDataFrame(100) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 0000ee62716f4..1c31bda16d117 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -121,8 +121,6 @@ def test_unstack_fill(self): expected = unstacked["w"] tm.assert_frame_equal(result, expected) - # TODO(ArrayManager) iset with multiple elements not yet implemented - # @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) iset def test_unstack_fill_frame(self): # From a dataframe @@ -1705,7 +1703,6 @@ def test_unstack_period_frame(self): tm.assert_frame_equal(result3, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_stack_multiple_bug(self): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) From 53765e5c729cac8f9284d7a7a584333b66c4dd09 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 09:42:58 +0100 Subject: [PATCH 4/5] no longer skip groupby --- pandas/tests/frame/test_stack_unstack.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 1c31bda16d117..fd23ea3a7621c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1466,7 +1466,6 @@ def test_stack_mixed_dtype(self, multiindex_dataframe_random_data): assert result.name is None assert stacked["bar"].dtype == np.float_ - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_unstack_bug(self): df = DataFrame( { From 8714c4401aa0493541814a01eb5c3821e5cfc0a7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 4 Mar 2021 08:26:38 +0100 Subject: [PATCH 5/5] skip set_flags for now --- pandas/tests/frame/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2f2de9764219b..76cfd77d254f2 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -296,6 +296,7 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem (no copy) @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags(self, allows_duplicate_labels, frame_or_series): obj = DataFrame({"A": [1, 2]})