From 6cd1c4dbbe1dfe742f65625b4a8ff7c75d034f06 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Feb 2021 12:11:41 +0100 Subject: [PATCH 01/21] [ArrayManager] Implement concat with reindexing --- .github/workflows/ci.yml | 1 + pandas/core/dtypes/concat.py | 160 +++++++++++++++++- pandas/core/internals/array_manager.py | 26 ++- pandas/core/internals/concat.py | 55 ++++-- pandas/tests/frame/methods/test_append.py | 21 ++- pandas/tests/frame/methods/test_drop.py | 2 +- pandas/tests/frame/methods/test_explode.py | 2 +- pandas/tests/frame/methods/test_join.py | 9 +- pandas/tests/io/formats/test_printing.py | 2 +- pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/reshape/concat/test_append.py | 6 + pandas/tests/reshape/concat/test_concat.py | 6 +- pandas/tests/reshape/concat/test_datetimes.py | 6 + 13 files changed, 256 insertions(+), 42 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b551e7ded0178..341ed8ef75cdd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,3 +157,4 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/reshape/concat/ --array-manager diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 5b46bee96d4b3..1afc97b0adc78 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -5,14 +5,19 @@ import numpy as np +from pandas._libs import NaT, lib from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( + is_bool_dtype, is_categorical_dtype, + is_datetime64_ns_dtype, is_dtype_equal, is_extension_array_dtype, + is_integer_dtype, is_sparse, + is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries @@ -21,11 +26,78 @@ from pandas.core.construction import array, ensure_wrapped_if_datetimelike +class NullArrayProxy: + """ + Proxy object for an all-NA array. 
+ + Only stores the length of the array, and not the dtype. The dtype + will only be known when actually concatenating (after determining the + common dtype, for which this proxy is ignored). + Using this object avoids that the internals/concat.py needs to determine + the proper dtype and array type. + """ + + ndim = 1 + + def __init__(self, n: int): + self.n = n + + @property + def shape(self): + return (self.n,) + + +def _array_from_proxy(arr, dtype: DtypeObj, fill_value=lib.no_default): + """ + Helper function to create the actual all-NA array from the NullArrayProxy object. + + Parameters + ---------- + arr : NullArrayProxy + dtype : the dtype for the resulting array + fill_value : scalar NA-like value + By default uses the ExtensionDtype's na_value or np.nan. For numpy + arrays, this can be overridden to be something else (eg None). + + Returns + ------- + np.ndarray or ExtensionArray + """ + if is_extension_array_dtype(dtype): + return dtype.construct_array_type()._from_sequence( + [dtype.na_value] * arr.n, dtype=dtype + ) + elif is_datetime64_ns_dtype(dtype): + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence([NaT] * arr.n, dtype=dtype) + elif is_timedelta64_ns_dtype(dtype): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence([NaT] * arr.n, dtype=dtype) + else: + if is_integer_dtype(dtype): + dtype = "float64" + fill_value = np.nan + elif is_bool_dtype(dtype): + dtype = object + + if fill_value is lib.no_default: + fill_value = np.nan + + arr = np.empty(arr.n, dtype=dtype) + arr.fill(fill_value) + return arr + + def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ Helper function for `arr.astype(common_dtype)` but handling all special cases. 
""" + if isinstance(arr, NullArrayProxy): + return _array_from_proxy(arr, dtype) + if ( is_categorical_dtype(arr.dtype) and isinstance(dtype, np.dtype) @@ -132,6 +204,75 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat, axis=axis) +def concat_arrays(to_concat): + """ + Alternative for concat_compat but specialized for use in the ArrayManager. + + Differences: only deals with 1D arrays (no axis keyword) and does not skip + empty arrays to determine the dtype. + In addition ensures that all NullArrayProxies get replaced with actual + arrays. + + Parameters + ---------- + to_concat : list of arrays + + Returns + ------- + np.ndarray or ExtensionArray + """ + # ignore the all-NA proxies to determine the resulting dtype + to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] + + kinds = {obj.dtype.kind for obj in to_concat_no_proxy} + single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 + any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat_no_proxy) + + if any_ea: + if not single_dtype: + target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy]) + to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] + else: + target_dtype = to_concat_no_proxy[0].dtype + to_concat = [ + _array_from_proxy(arr, target_dtype) + if isinstance(arr, NullArrayProxy) + else arr + for arr in to_concat + ] + + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + else: + return np.concatenate(to_concat) + + elif any(kind in ["m", "M"] for kind in kinds): + return _concat_datetime(to_concat) + + if not single_dtype: + target_dtype = np.find_common_type( + [arr.dtype for arr in to_concat_no_proxy], [] + ) + else: + target_dtype = to_concat_no_proxy[0].dtype + to_concat = [ + _array_from_proxy(arr, target_dtype) if isinstance(arr, NullArrayProxy) else arr + for arr in to_concat + ] + + result = np.concatenate(to_concat) + + # 
TODO(ArrayManager) this is currently inconsistent between Series and DataFrame + # so we should decide whether to keep the below special case or remove it + if len(result) == 0: + # all empties -> check for bool to not coerce to float + if len(kinds) != 1: + if "b" in kinds: + result = result.astype(object) + return result + + def union_categoricals( to_union, sort_categories: bool = False, ignore_order: bool = False ): @@ -322,20 +463,35 @@ def _concat_datetime(to_concat, axis=0): a single array, preserving the combined dtypes """ to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] + to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - single_dtype = len({x.dtype for x in to_concat}) == 1 + single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 # multiple types, need to coerce to object if not single_dtype: # ensure_wrapped_if_datetimelike ensures that astype(object) wraps # in Timestamp/Timedelta + to_concat = [ + _array_from_proxy(arr, dtype=object, fill_value=None) + if isinstance(arr, NullArrayProxy) + else arr + for arr in to_concat + ] + return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: # TODO(EA2D): kludge not necessary with 2D EAs to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] + else: + to_concat = [ + _array_from_proxy(arr, dtype=to_concat_no_proxy[0].dtype) + if isinstance(arr, NullArrayProxy) + else arr + for arr in to_concat + ] - result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) + result = type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=axis) if result.ndim == 2 and is_extension_array_dtype(result.dtype): # TODO(EA2D): kludge not necessary with 2D EAs diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0f677ff3180be..10648057b28a7 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -18,6 +18,7 @@ 
is_extension_array_dtype, is_numeric_dtype, ) +from pandas.core.dtypes.concat import NullArrayProxy from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna @@ -725,10 +726,20 @@ def reindex_indexer( # ignored keywords consolidate: bool = True, only_slice: bool = False, + # ArrayManager specific keywords + do_integrity_check=True, + use_na_proxy=False, ) -> T: axis = self._normalize_axis(axis) return self._reindex_indexer( - new_axis, indexer, axis, fill_value, allow_dups, copy + new_axis, + indexer, + axis, + fill_value, + allow_dups, + copy, + do_integrity_check, + use_na_proxy, ) def _reindex_indexer( @@ -739,6 +750,8 @@ def _reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, + do_integrity_check=True, + use_na_proxy=False, ) -> T: """ Parameters @@ -773,7 +786,9 @@ def _reindex_indexer( new_arrays = [] for i in indexer: if i == -1: - arr = self._make_na_array(fill_value=fill_value) + arr = self._make_na_array( + fill_value=fill_value, use_na_proxy=use_na_proxy + ) else: arr = self.arrays[i] new_arrays.append(arr) @@ -793,7 +808,7 @@ def _reindex_indexer( new_axes = list(self._axes) new_axes[axis] = new_axis - return type(self)(new_arrays, new_axes) + return type(self)(new_arrays, new_axes, do_integrity_check=do_integrity_check) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -820,10 +835,11 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True ) - def _make_na_array(self, fill_value=None): + def _make_na_array(self, fill_value=None, use_na_proxy=False): + if use_na_proxy: + return NullArrayProxy(self.shape_proper[0]) if fill_value is None: fill_value = np.nan - dtype, fill_value = infer_dtype_from_scalar(fill_value) values = np.empty(self.shape_proper[0], dtype=dtype) 
values.fill(fill_value) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 3dcfa85ed5c08..51732505c1bc0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -23,7 +23,7 @@ is_sparse, is_timedelta64_dtype, ) -from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.concat import concat_arrays, concat_compat from pandas.core.dtypes.missing import isna_all import pandas.core.algorithms as algos @@ -37,6 +37,45 @@ from pandas.core.arrays.sparse.dtype import SparseDtype +def concatenate_array_managers( + mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool +) -> Manager: + """ + Concatenate array managers into one. + + Parameters + ---------- + mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + Returns + ------- + ArrayManager + """ + # reindex all arrays + mgrs = [] + for mgr, indexers in mgrs_indexers: + for ax, indexer in indexers.items(): + mgr = mgr.reindex_indexer( + axes[ax], indexer, axis=ax, do_integrity_check=False, use_na_proxy=True + ) + mgrs.append(mgr) + + # concatting along the rows -> concat the reindexed arrays + if concat_axis == 1: + arrays = [ + concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) + for j in range(len(mgrs[0].arrays)) + ] + return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) + # concatting along the columns -> combine reindexed arrays in a single manager + elif concat_axis == 0: + arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) + return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) + + def concatenate_block_managers( mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool ) -> Manager: @@ -55,19 +94,7 @@ def concatenate_block_managers( BlockManager """ if isinstance(mgrs_indexers[0][0], ArrayManager): - - if concat_axis == 1: - # TODO for now only fastpath without indexers - mgrs 
= [t[0] for t in mgrs_indexers] - arrays = [ - concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0) - for j in range(len(mgrs[0].arrays)) - ] - return ArrayManager(arrays, [axes[1], axes[0]]) - elif concat_axis == 0: - mgrs = [t[0] for t in mgrs_indexers] - arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - return ArrayManager(arrays, [axes[1], axes[0]]) + return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 36c875b8abe6f..5d94bc63a5365 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -1,15 +1,10 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range, timedelta_range import pandas._testing as tm -# TODO td.skip_array_manager_not_yet_implemented -# appending with reindexing not yet working - class TestDataFrameAppend: def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series): @@ -37,7 +32,6 @@ def test_append_empty_list(self): tm.assert_frame_equal(result, expected) assert result is not df # .append() should return a new object - @td.skip_array_manager_not_yet_implemented def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -78,7 +72,6 @@ def test_append_series_dict(self): expected = df.append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_append_list_of_series_dicts(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -97,7 +90,6 @@ def test_append_list_of_series_dicts(self): expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) 
tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data @@ -142,8 +134,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented - def test_append_dtypes(self): + def test_append_dtypes(self, using_array_manager): # GH 5754 # row appends of different dtypes (so need to do by-item) @@ -167,6 +158,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -175,6 +169,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) @@ -183,6 +180,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} ) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -202,7 +202,6 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): expected = Series(Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "data, dtype", [ diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 4568cda24d5cf..2af5c8dd29842 100644 --- 
a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -155,7 +155,7 @@ def test_drop(self): assert return_value is None tm.assert_frame_equal(df, expected) - @td.skip_array_manager_not_yet_implemented + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_drop_multiindex_not_lexsorted(self): # GH#11640 diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index be80dd49ff1fb..c5a4fcc3cc3c2 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -6,7 +6,7 @@ import pandas as pd import pandas._testing as tm -# TODO(ArrayManager) concat with reindexing +# TODO(ArrayManager) concat reindex with duplicates pytestmark = td.skip_array_manager_not_yet_implemented diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 42694dc3ff37c..20658a3dcf0b2 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -9,9 +9,6 @@ from pandas import DataFrame, Index, MultiIndex, date_range, period_range import pandas._testing as tm -# TODO(ArrayManager) concat with reindexing -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.fixture def frame_with_period_index(): @@ -183,6 +180,7 @@ def test_join_period_index(frame_with_period_index): tm.assert_frame_equal(joined, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat with duplicates def test_join_left_sequence_non_unique_index(): # https://github.com/pandas-dev/pandas/issues/19607 df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3]) @@ -234,8 +232,9 @@ def test_join(self, multiindex_dataframe_random_data): b = frame.loc[frame.index[2:], ["B", "C"]] joined = a.join(b, how="outer").reindex(frame.index) - expected = frame.copy() - expected.values[np.isnan(joined.values)] = np.nan + expected = frame.copy().values + expected[np.isnan(joined.values)] = 
np.nan + expected = DataFrame(expected, index=frame.index, columns=frame.columns) assert not np.isnan(joined.values).all() diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 2339e21288bb5..24d1973eeda6d 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -121,7 +121,7 @@ def test_ambiguous_width(self): assert adjoined == expected -@td.skip_array_manager_not_yet_implemented +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON class TestTableSchemaRepr: @classmethod def setup_class(cls): diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index d9575a6ad81e5..3131131682ccd 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -247,7 +247,7 @@ def test_pickle_options(fsspectest): tm.assert_frame_equal(df, out) -@td.skip_array_manager_not_yet_implemented +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON def test_json_options(fsspectest): df = DataFrame({"a": [0]}) df.to_json("testmem://afile", storage_options={"test": "json_write"}) diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index dd6dbd79113e5..d4e16b0a9196c 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, concat, isna import pandas._testing as tm @@ -331,6 +333,10 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended["A"].dtype == "f8" assert appended["B"].dtype == "O" + # TODO(ArrayManager) DataFrame.append reindexes a Series itself (giving + # float dtype) -> delay reindexing until concat_array_managers which properly + # takes care of all-null dtype inference + @td.skip_array_manager_not_yet_implemented def 
test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 575903de8f946..f755ec3f76f39 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range import pandas._testing as tm @@ -14,7 +16,9 @@ class TestConcatenate: - def test_concat_copy(self): + # TODO(ArrayManager) using block internals to verify, needs rewrite + @td.skip_array_manager_invalid_test + def test_concat_copy(self, using_array_manager): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 92181e7dffc50..7e2522b6dcf36 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -32,6 +34,9 @@ def test_concat_datetime64_block(self): assert (result.iloc[:10]["time"] == rng).all() assert (result.iloc[10:]["time"] == rng).all() + # TODO(ArrayManager) concat with mixed managers + # (or, fix DataFrame.from_records to honor option) + @td.skip_array_manager_not_yet_implemented def test_concat_datetime_datetime64_frame(self): # GH#2624 rows = [] @@ -46,6 +51,7 @@ def test_concat_datetime_datetime64_frame(self): # it works! 
pd.concat([df1, df2_obj]) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_concat_datetime_timezone(self): # GH 18523 idx1 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") From 73d9de2b1795bf2a89b76a07acda8be6e5878e37 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Feb 2021 13:57:14 +0100 Subject: [PATCH 02/21] fix mypy --- pandas/core/internals/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 51732505c1bc0..efdbd0a739c4a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -71,7 +71,8 @@ def concatenate_array_managers( ] return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) # concatting along the columns -> combine reindexed arrays in a single manager - elif concat_axis == 0: + else: + assert concat_axis == 0 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) From 272d67409a8e3126e0a1df9de928a1a733f0b807 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Feb 2021 14:27:45 +0100 Subject: [PATCH 03/21] pass through allow dups --- .github/workflows/ci.yml | 2 +- pandas/core/internals/concat.py | 7 ++++++- pandas/tests/frame/methods/test_explode.py | 5 ----- pandas/tests/frame/methods/test_join.py | 3 --- pandas/tests/reshape/merge/test_join.py | 3 +++ pandas/tests/reshape/merge/test_merge.py | 8 +++++++- pandas/tests/reshape/test_crosstab.py | 4 ++++ pandas/tests/reshape/test_pivot.py | 4 ++++ pandas/tests/reshape/test_pivot_multilevel.py | 7 +++++-- 9 files changed, 30 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 341ed8ef75cdd..a4a0d3b779565 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,4 +157,4 @@ jobs: run: | source activate pandas-dev pytest 
pandas/tests/frame/methods --array-manager - pytest pandas/tests/reshape/concat/ --array-manager + pytest pandas/tests/reshape --array-manager diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index efdbd0a739c4a..793bc9bda30fd 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -59,7 +59,12 @@ def concatenate_array_managers( for mgr, indexers in mgrs_indexers: for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer( - axes[ax], indexer, axis=ax, do_integrity_check=False, use_na_proxy=True + axes[ax], + indexer, + axis=ax, + allow_dups=True, + do_integrity_check=False, + use_na_proxy=True, ) mgrs.append(mgr) diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index c5a4fcc3cc3c2..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -1,14 +1,9 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm -# TODO(ArrayManager) concat reindex with duplicates -pytestmark = td.skip_array_manager_not_yet_implemented - def test_error(): df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 20658a3dcf0b2..2bbc73da24033 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import DataFrame, Index, MultiIndex, date_range, period_range import pandas._testing as tm @@ -180,7 +178,6 @@ def test_join_period_index(frame_with_period_index): tm.assert_frame_equal(joined, expected) -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat with duplicates def test_join_left_sequence_non_unique_index(): # https://github.com/pandas-dev/pandas/issues/19607 df1 = DataFrame({"a": [0, 10, 
20]}, index=[1, 2, 3]) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 500d7000817af..242fc8cade9e6 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -547,6 +549,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index da3ac81c4aa17..5d3f5046fdbf6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -277,6 +279,7 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() + @td.skip_array_manager_invalid_test # TODO(ArrayManager) join copy behaviour def test_merge_nocopy(self): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) @@ -656,7 +659,7 @@ def _constructor(self): assert isinstance(result, NotADataFrame) - def test_join_append_timedeltas(self): + def test_join_append_timedeltas(self, using_array_manager): # timedelta64 issues with join/merge # GH 5695 @@ -670,6 +673,8 @@ def test_join_append_timedeltas(self): "t": [timedelta(0, 22500), timedelta(0, 22500)], } ) + if using_array_manager: + expected = expected.astype(object) tm.assert_frame_equal(result, expected) td = np.timedelta64(300000000) @@ -1362,6 +1367,7 
@@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite test def test_merge_readonly(self): # https://github.com/pandas-dev/pandas/issues/27943 data1 = DataFrame( diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 6faf64789c687..10aebdccc04a5 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -1,11 +1,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_categorical_dtype from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, Series, crosstab import pandas._testing as tm +pytestmark = td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby + class TestCrosstab: def setup_method(self, method): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f9b2a02920841..4d67dd7813ce7 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -19,6 +21,8 @@ from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import pivot_table +pytestmark = td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby + @pytest.fixture(params=[True, False]) def dropna(request): diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py index f59a469c05d15..a2e59e04a6eb8 100644 --- a/pandas/tests/reshape/test_pivot_multilevel.py +++ b/pandas/tests/reshape/test_pivot_multilevel.py @@ -192,7 +192,7 @@ def test_pivot_list_like_columns( tm.assert_frame_equal(result, expected) -def test_pivot_multiindexed_rows_and_cols(): +def 
test_pivot_multiindexed_rows_and_cols(using_array_manager): # GH 36360 df = pd.DataFrame( @@ -214,11 +214,14 @@ def test_pivot_multiindexed_rows_and_cols(): ) expected = pd.DataFrame( - data=[[5.0, np.nan], [10.0, 7.0]], + data=[[5, np.nan], [10, 7.0]], columns=MultiIndex.from_tuples( [(0, 1, 0), (0, 1, 1)], names=["col_L0", "col_L1", "idx_L1"] ), index=Int64Index([0, 1], dtype="int64", name="idx_L0"), ) + if not using_array_manager: + # BlockManager does not preserve the dtypes + expected = expected.astype("float64") tm.assert_frame_equal(res, expected) From 555d7ac16296966f6ea0d5425e42df737f76cd26 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Feb 2021 16:43:12 +0100 Subject: [PATCH 04/21] simplify _array_from_proxy --- pandas/core/dtypes/concat.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 1afc97b0adc78..ed7763e844c9e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -5,21 +5,20 @@ import numpy as np -from pandas._libs import NaT, lib +from pandas._libs import lib from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, - is_datetime64_ns_dtype, is_dtype_equal, is_extension_array_dtype, is_integer_dtype, is_sparse, - is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries +from pandas.core.dtypes.missing import na_value_for_dtype from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray @@ -67,27 +66,18 @@ def _array_from_proxy(arr, dtype: DtypeObj, fill_value=lib.no_default): return dtype.construct_array_type()._from_sequence( [dtype.na_value] * arr.n, dtype=dtype ) - elif is_datetime64_ns_dtype(dtype): - from pandas.core.arrays import DatetimeArray - - return DatetimeArray._from_sequence([NaT] * arr.n, 
dtype=dtype) - elif is_timedelta64_ns_dtype(dtype): - from pandas.core.arrays import TimedeltaArray - - return TimedeltaArray._from_sequence([NaT] * arr.n, dtype=dtype) else: if is_integer_dtype(dtype): - dtype = "float64" - fill_value = np.nan + dtype = np.dtype("float64") elif is_bool_dtype(dtype): - dtype = object + dtype = np.dtype(object) if fill_value is lib.no_default: - fill_value = np.nan + fill_value = na_value_for_dtype(dtype) arr = np.empty(arr.n, dtype=dtype) arr.fill(fill_value) - return arr + return ensure_wrapped_if_datetimelike(arr) def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: From 19c7f751b0c4d93d81f07752698c049470cc0b64 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 15:02:54 +0100 Subject: [PATCH 05/21] fix creation empty + turn into method --- pandas/core/dtypes/concat.py | 77 ++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ed7763e844c9e..632bc0c34d78b 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -45,39 +45,40 @@ def __init__(self, n: int): def shape(self): return (self.n,) + def to_array(self, dtype: DtypeObj, fill_value=lib.no_default) -> ArrayLike: + """ + Helper function to create the actual all-NA array from the NullArrayProxy + object. + + Parameters + ---------- + arr : NullArrayProxy + dtype : the dtype for the resulting array + fill_value : scalar NA-like value + By default uses the ExtensionDtype's na_value or np.nan. For numpy + arrays, this can be overridden to be something else (eg None). 
+ + Returns + ------- + np.ndarray or ExtensionArray + """ + if is_extension_array_dtype(dtype): + empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) + indexer = -np.ones(self.n, dtype=np.intp) + return empty.take(indexer, allow_fill=True) + else: + # when introducing missing values, int becomes float, bool becomes object + if is_integer_dtype(dtype): + dtype = np.dtype("float64") + elif is_bool_dtype(dtype): + dtype = np.dtype(object) -def _array_from_proxy(arr, dtype: DtypeObj, fill_value=lib.no_default): - """ - Helper function to create the actual all-NA array from the NullArrayProxy object. - - Parameters - ---------- - arr : NullArrayProxy - dtype : the dtype for the resulting array - fill_value : scalar NA-like value - By default uses the ExtensionDtype's na_value or np.nan. For numpy - arrays, this can be overridden to be something else (eg None). - - Returns - ------- - np.ndarray or ExtensionArray - """ - if is_extension_array_dtype(dtype): - return dtype.construct_array_type()._from_sequence( - [dtype.na_value] * arr.n, dtype=dtype - ) - else: - if is_integer_dtype(dtype): - dtype = np.dtype("float64") - elif is_bool_dtype(dtype): - dtype = np.dtype(object) - - if fill_value is lib.no_default: - fill_value = na_value_for_dtype(dtype) + if fill_value is lib.no_default: + fill_value = na_value_for_dtype(dtype) - arr = np.empty(arr.n, dtype=dtype) - arr.fill(fill_value) - return ensure_wrapped_if_datetimelike(arr) + arr = np.empty(self.n, dtype=dtype) + arr.fill(fill_value) + return ensure_wrapped_if_datetimelike(arr) def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: @@ -86,7 +87,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: cases. 
""" if isinstance(arr, NullArrayProxy): - return _array_from_proxy(arr, dtype) + return arr.to_array(dtype) if ( is_categorical_dtype(arr.dtype) @@ -194,7 +195,7 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat, axis=axis) -def concat_arrays(to_concat): +def concat_arrays(to_concat) -> ArrayLike: """ Alternative for concat_compat but specialized for use in the ArrayManager. @@ -225,9 +226,7 @@ def concat_arrays(to_concat): else: target_dtype = to_concat_no_proxy[0].dtype to_concat = [ - _array_from_proxy(arr, target_dtype) - if isinstance(arr, NullArrayProxy) - else arr + arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr for arr in to_concat ] @@ -247,7 +246,7 @@ def concat_arrays(to_concat): else: target_dtype = to_concat_no_proxy[0].dtype to_concat = [ - _array_from_proxy(arr, target_dtype) if isinstance(arr, NullArrayProxy) else arr + arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr for arr in to_concat ] @@ -462,7 +461,7 @@ def _concat_datetime(to_concat, axis=0): # ensure_wrapped_if_datetimelike ensures that astype(object) wraps # in Timestamp/Timedelta to_concat = [ - _array_from_proxy(arr, dtype=object, fill_value=None) + arr.to_array(object, fill_value=None) if isinstance(arr, NullArrayProxy) else arr for arr in to_concat @@ -475,7 +474,7 @@ def _concat_datetime(to_concat, axis=0): to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] else: to_concat = [ - _array_from_proxy(arr, dtype=to_concat_no_proxy[0].dtype) + arr.to_array(to_concat_no_proxy[0].dtype) if isinstance(arr, NullArrayProxy) else arr for arr in to_concat From 42e1b059da543737815b21ef8a2c99b755ea0b1f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Feb 2021 08:30:46 +0100 Subject: [PATCH 06/21] remove overriding of fill_value --- pandas/core/dtypes/concat.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py 
index 632bc0c34d78b..ed0fb67380eca 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -5,7 +5,6 @@ import numpy as np -from pandas._libs import lib from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.cast import find_common_type @@ -45,7 +44,7 @@ def __init__(self, n: int): def shape(self): return (self.n,) - def to_array(self, dtype: DtypeObj, fill_value=lib.no_default) -> ArrayLike: + def to_array(self, dtype: DtypeObj) -> ArrayLike: """ Helper function to create the actual all-NA array from the NullArrayProxy object. @@ -54,9 +53,6 @@ def to_array(self, dtype: DtypeObj, fill_value=lib.no_default) -> ArrayLike: ---------- arr : NullArrayProxy dtype : the dtype for the resulting array - fill_value : scalar NA-like value - By default uses the ExtensionDtype's na_value or np.nan. For numpy - arrays, this can be overridden to be something else (eg None). Returns ------- @@ -73,9 +69,7 @@ def to_array(self, dtype: DtypeObj, fill_value=lib.no_default) -> ArrayLike: elif is_bool_dtype(dtype): dtype = np.dtype(object) - if fill_value is lib.no_default: - fill_value = na_value_for_dtype(dtype) - + fill_value = na_value_for_dtype(dtype) arr = np.empty(self.n, dtype=dtype) arr.fill(fill_value) return ensure_wrapped_if_datetimelike(arr) @@ -461,9 +455,7 @@ def _concat_datetime(to_concat, axis=0): # ensure_wrapped_if_datetimelike ensures that astype(object) wraps # in Timestamp/Timedelta to_concat = [ - arr.to_array(object, fill_value=None) - if isinstance(arr, NullArrayProxy) - else arr + arr.to_array(object) if isinstance(arr, NullArrayProxy) else arr for arr in to_concat ] From a2aa388c7bab7b6925fffa145fc1cfd6cb55db17 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Feb 2021 14:10:17 +0100 Subject: [PATCH 07/21] use ensure_dtype_can_hold_na --- pandas/core/dtypes/concat.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py 
index d191d905c3071..35587c5496f21 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,13 +7,11 @@ from pandas._typing import ArrayLike, DtypeObj -from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, is_dtype_equal, is_extension_array_dtype, - is_integer_dtype, is_sparse, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries @@ -64,11 +62,7 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike: return empty.take(indexer, allow_fill=True) else: # when introducing missing values, int becomes float, bool becomes object - if is_integer_dtype(dtype): - dtype = np.dtype("float64") - elif is_bool_dtype(dtype): - dtype = np.dtype(object) - + dtype = ensure_dtype_can_hold_na(dtype) fill_value = na_value_for_dtype(dtype) arr = np.empty(self.n, dtype=dtype) arr.fill(fill_value) From 6bdd1754ac9608305fb330d54556a118ba746f0c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Feb 2021 15:27:12 +0100 Subject: [PATCH 08/21] add type annotation --- pandas/core/dtypes/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 35587c5496f21..cbc7db4aa6574 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,7 +1,7 @@ """ Utility functions related to concat. """ -from typing import cast +from typing import Any, List, cast import numpy as np @@ -183,7 +183,7 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat, axis=axis) -def concat_arrays(to_concat) -> ArrayLike: +def concat_arrays(to_concat: List[Any]) -> ArrayLike: """ Alternative for concat_compat but specialized for use in the ArrayManager. 
From cab90f6b8fa3bc905fcb02157a5e92e40429fcd4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Feb 2021 11:10:54 +0100 Subject: [PATCH 09/21] address review --- pandas/core/dtypes/concat.py | 2 -- pandas/core/internals/array_manager.py | 11 +++++++---- pandas/tests/frame/methods/test_append.py | 1 + pandas/tests/reshape/merge/test_merge.py | 23 +++++++++++++++++------ 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 767cb5fcf78fc..89637f9da112e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -244,8 +244,6 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: result = np.concatenate(to_concat) - # TODO(ArrayManager) this is currently inconsistent between Series and DataFrame - # so we should decide whether to keep the below special case or remove it if len(result) == 0: # all empties -> check for bool to not coerce to float if len(kinds) != 1: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 06b752726ef12..ac391f8750e0e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -727,8 +727,8 @@ def reindex_indexer( consolidate: bool = True, only_slice: bool = False, # ArrayManager specific keywords - do_integrity_check=True, - use_na_proxy=False, + do_integrity_check: bool = True, + use_na_proxy: bool = False, ) -> T: axis = self._normalize_axis(axis) return self._reindex_indexer( @@ -750,8 +750,8 @@ def _reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, - do_integrity_check=True, - use_na_proxy=False, + do_integrity_check: bool = True, + use_na_proxy: bool = False, ) -> T: """ Parameters @@ -837,9 +837,12 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True def _make_na_array(self, fill_value=None, use_na_proxy=False): if use_na_proxy: + assert fill_value is None return 
NullArrayProxy(self.shape_proper[0]) + if fill_value is None: fill_value = np.nan + dtype, fill_value = infer_dtype_from_scalar(fill_value) values = np.empty(self.shape_proper[0], dtype=dtype) values.fill(fill_value) diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 5d94bc63a5365..598366923aa80 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -159,6 +159,7 @@ def test_append_dtypes(self, using_array_manager): {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) if using_array_manager: + # TODO(ArrayManager) decide on exact casting rules in concat # With ArrayManager, all-NaN float is not ignored expected = expected.astype(object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5d3f5046fdbf6..b3e6b64e3b4a2 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -279,18 +279,28 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - @td.skip_array_manager_invalid_test # TODO(ArrayManager) join copy behaviour - def test_merge_nocopy(self): + def test_merge_nocopy(self, using_array_manager): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) - merged["a"] = 6 - assert (left["a"] == 6).all() + if using_array_manager: + # With ArrayManager, setting a column doesn't change the values inplace + # and thus does not propagate the changes to the original left/right + # dataframes -> need to check that no copy was made in a different way + # TODO(ArrayManager) we should be able to simplify this with a .loc + # setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10 + # but this currently replaces the array 
(_setitem_with_indexer_split_path) + merged.loc[0, "a"] = 10 + assert merged._mgr.arrays[0] is left._mgr.arrays[0] + assert merged._mgr.arrays[2] is right._mgr.arrays[0] + else: + merged["a"] = 6 + assert (left["a"] == 6).all() - merged["d"] = "peekaboo" - assert (right["d"] == "peekaboo").all() + merged["d"] = "peekaboo" + assert (right["d"] == "peekaboo").all() def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -674,6 +684,7 @@ def test_join_append_timedeltas(self, using_array_manager): } ) if using_array_manager: + # TODO(ArrayManager) decide on exact casting rules in concat expected = expected.astype(object) tm.assert_frame_equal(result, expected) From c22a0102ab2d7e1fd333d7f00b77b5e916bec345 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Feb 2021 11:40:58 +0100 Subject: [PATCH 10/21] update comment --- pandas/core/dtypes/concat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 89637f9da112e..1f334293ac6a4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -244,6 +244,8 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: result = np.concatenate(to_concat) + # TODO decide on exact behaviour (we shouldn't do this only for empty result) + # see https://github.com/pandas-dev/pandas/issues/39817 if len(result) == 0: # all empties -> check for bool to not coerce to float if len(kinds) != 1: From eec01615a773c7390771f966635dd88b1eb22cdc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Feb 2021 11:41:51 +0100 Subject: [PATCH 11/21] fixup test --- pandas/tests/reshape/merge/test_merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b3e6b64e3b4a2..c7a3aed7edce5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -292,7 
+292,6 @@ def test_merge_nocopy(self, using_array_manager): # TODO(ArrayManager) we should be able to simplify this with a .loc # setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10 # but this currently replaces the array (_setitem_with_indexer_split_path) - merged.loc[0, "a"] = 10 assert merged._mgr.arrays[0] is left._mgr.arrays[0] assert merged._mgr.arrays[2] is right._mgr.arrays[0] else: From 04ead63e832b501978ee0a7a8cf27b961cb1c0de Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Mar 2021 10:24:29 +0100 Subject: [PATCH 12/21] update/remove skips --- pandas/tests/reshape/concat/test_datetimes.py | 6 ------ pandas/tests/reshape/merge/test_join.py | 3 --- pandas/tests/reshape/merge/test_merge.py | 3 --- pandas/tests/reshape/test_crosstab.py | 9 ++++----- pandas/tests/reshape/test_pivot.py | 9 ++++----- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index d0653de964556..2b8233388d328 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -5,8 +5,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -34,9 +32,6 @@ def test_concat_datetime64_block(self): assert (result.iloc[:10]["time"] == rng).all() assert (result.iloc[10:]["time"] == rng).all() - # TODO(ArrayManager) concat with mixed managers - # (or, fix DataFrame.from_records to honor option) - @td.skip_array_manager_not_yet_implemented def test_concat_datetime_datetime64_frame(self): # GH#2624 rows = [] @@ -51,7 +46,6 @@ def test_concat_datetime_datetime64_frame(self): # it works! 
concat([df1, df2_obj]) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_concat_datetime_timezone(self): # GH 18523 idx1 = date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index d31930aa233cd..fb161e38c7155 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -553,7 +551,6 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4ea5fc6af1c91..9699a0dec4891 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -9,8 +9,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import ( is_categorical_dtype, is_object_dtype, @@ -1387,7 +1385,6 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite test def test_merge_readonly(self): # https://github.com/pandas-dev/pandas/issues/27943 data1 = DataFrame( diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index dfc336ffb907e..44299d51a878f 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from 
pandas.core.dtypes.common import is_categorical_dtype from pandas import ( @@ -15,8 +13,6 @@ ) import pandas._testing as tm -pytestmark = td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby - class TestCrosstab: def setup_method(self, method): @@ -442,7 +438,7 @@ def test_crosstab_normalize_arrays(self): ) tm.assert_frame_equal(test_case, norm_sum) - def test_crosstab_with_empties(self): + def test_crosstab_with_empties(self, using_array_manager): # Check handling of empties df = DataFrame( { @@ -467,6 +463,9 @@ def test_crosstab_with_empties(self): index=Index([1, 2], name="a", dtype="int64"), columns=Index([3, 4], name="b"), ) + if using_array_manager: + # INFO(ArrayManager) column without NaNs can preserve int dtype + nans[3] = nans[3].astype("int64") calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False) tm.assert_frame_equal(nans, calculated) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 04e7db127307d..20aa0c9e2ee9a 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -25,8 +23,6 @@ from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import pivot_table -pytestmark = td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby - @pytest.fixture(params=[True, False]) def dropna(request): @@ -1201,7 +1197,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name): margins_name=margin_name, ) - def test_pivot_timegrouper(self): + def test_pivot_timegrouper(self, using_array_manager): df = DataFrame( { "Branch": "A A A A A A A B".split(), @@ -1255,6 +1251,9 @@ def test_pivot_timegrouper(self): ) expected.index.name = "Date" expected.columns.name = "Buyer" + if using_array_manager: + # INFO(ArrayManager) column without NaNs can preserve 
int dtype + expected["Carl"] = expected["Carl"].astype("int64") result = pivot_table( df, From 427b6f4cbea8568f1c7b955102df3c5f7208a78a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Mar 2021 11:07:56 +0100 Subject: [PATCH 13/21] move logic into internals --- pandas/core/dtypes/concat.py | 151 +------------------------ pandas/core/internals/array_manager.py | 51 ++++++++- pandas/core/internals/concat.py | 95 +++++++++++++++- 3 files changed, 149 insertions(+), 148 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 8a9ce71192046..f51f8faae5604 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,11 +1,7 @@ """ Utility functions related to concat. """ -from typing import ( - Any, - List, - cast, -) +from typing import cast import numpy as np @@ -14,14 +10,10 @@ DtypeObj, ) -from pandas.core.dtypes.cast import ( - ensure_dtype_can_hold_na, - find_common_type, -) +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, is_dtype_equal, - is_extension_array_dtype, is_sparse, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -29,7 +21,6 @@ ABCCategoricalIndex, ABCSeries, ) -from pandas.core.dtypes.missing import na_value_for_dtype from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray @@ -39,61 +30,11 @@ ) -class NullArrayProxy: - """ - Proxy object for an all-NA array. - - Only stores the length of the array, and not the dtype. The dtype - will only be known when actually concatenating (after determining the - common dtype, for which this proxy is ignored). - Using this object avoids that the internals/concat.py needs to determine - the proper dtype and array type. 
- """ - - ndim = 1 - - def __init__(self, n: int): - self.n = n - - @property - def shape(self): - return (self.n,) - - def to_array(self, dtype: DtypeObj) -> ArrayLike: - """ - Helper function to create the actual all-NA array from the NullArrayProxy - object. - - Parameters - ---------- - arr : NullArrayProxy - dtype : the dtype for the resulting array - - Returns - ------- - np.ndarray or ExtensionArray - """ - if is_extension_array_dtype(dtype): - empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) - indexer = -np.ones(self.n, dtype=np.intp) - return empty.take(indexer, allow_fill=True) - else: - # when introducing missing values, int becomes float, bool becomes object - dtype = ensure_dtype_can_hold_na(dtype) - fill_value = na_value_for_dtype(dtype) - arr = np.empty(self.n, dtype=dtype) - arr.fill(fill_value) - return ensure_wrapped_if_datetimelike(arr) - - -def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: +def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ Helper function for `arr.astype(common_dtype)` but handling all special cases. """ - if isinstance(arr, NullArrayProxy): - return arr.to_array(dtype) - if ( is_categorical_dtype(arr.dtype) and isinstance(dtype, np.dtype) @@ -180,7 +121,7 @@ def is_nonempty(x) -> bool: # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) - to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] + to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) @@ -207,73 +148,6 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat, axis=axis) -def concat_arrays(to_concat: List[Any]) -> ArrayLike: - """ - Alternative for concat_compat but specialized for use in the ArrayManager. - - Differences: only deals with 1D arrays (no axis keyword) and does not skip - empty arrays to determine the dtype. 
- In addition ensures that all NullArrayProxies get replaced with actual - arrays. - - Parameters - ---------- - to_concat : list of arrays - - Returns - ------- - np.ndarray or ExtensionArray - """ - # ignore the all-NA proxies to determine the resulting dtype - to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - - kinds = {obj.dtype.kind for obj in to_concat_no_proxy} - single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 - any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat_no_proxy) - - if any_ea: - if not single_dtype: - target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy]) - to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - else: - target_dtype = to_concat_no_proxy[0].dtype - to_concat = [ - arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr - for arr in to_concat - ] - - if isinstance(to_concat[0], ExtensionArray): - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat) - else: - return np.concatenate(to_concat) - - elif any(kind in ["m", "M"] for kind in kinds): - return _concat_datetime(to_concat) - - if not single_dtype: - target_dtype = np.find_common_type( - [arr.dtype for arr in to_concat_no_proxy], [] - ) - else: - target_dtype = to_concat_no_proxy[0].dtype - to_concat = [ - arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr - for arr in to_concat - ] - - result = np.concatenate(to_concat) - - # TODO decide on exact behaviour (we shouldn't do this only for empty result) - # see https://github.com/pandas-dev/pandas/issues/39817 - if len(result) == 0: - # all empties -> check for bool to not coerce to float - if len(kinds) != 1: - if "b" in kinds: - result = result.astype(object) - return result - - def union_categoricals( to_union, sort_categories: bool = False, ignore_order: bool = False ): @@ -464,33 +338,20 @@ def _concat_datetime(to_concat, axis=0): a single array, preserving the combined dtypes """ 
to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] - to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 + single_dtype = len({x.dtype for x in to_concat}) == 1 # multiple types, need to coerce to object if not single_dtype: # ensure_wrapped_if_datetimelike ensures that astype(object) wraps # in Timestamp/Timedelta - to_concat = [ - arr.to_array(object) if isinstance(arr, NullArrayProxy) else arr - for arr in to_concat - ] - return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: # TODO(EA2D): kludge not necessary with 2D EAs to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] - else: - to_concat = [ - arr.to_array(to_concat_no_proxy[0].dtype) - if isinstance(arr, NullArrayProxy) - else arr - for arr in to_concat - ] - result = type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=axis) + result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) if result.ndim == 2 and isinstance(result.dtype, ExtensionDtype): # TODO(EA2D): kludge not necessary with 2D EAs diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 9dc549eb25ed1..cc1526b78ec9f 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -22,12 +22,14 @@ ) from pandas._typing import ( ArrayLike, + DtypeObj, Hashable, ) from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( astype_array_safe, + ensure_dtype_can_hold_na, infer_dtype_from_scalar, soft_convert_objects, ) @@ -41,7 +43,6 @@ is_object_dtype, is_timedelta64_ns_dtype, ) -from pandas.core.dtypes.concat import NullArrayProxy from pandas.core.dtypes.dtypes import ( ExtensionDtype, PandasDtype, @@ -54,6 +55,7 @@ from pandas.core.dtypes.missing import ( array_equals, isna, + na_value_for_dtype, ) import pandas.core.algorithms as algos @@ -1299,3 +1301,50 
@@ def set_values(self, values: ArrayLike): valid for the current SingleArrayManager (length, dtype, etc). """ self.arrays[0] = values + + +class NullArrayProxy: + """ + Proxy object for an all-NA array. + + Only stores the length of the array, and not the dtype. The dtype + will only be known when actually concatenating (after determining the + common dtype, for which this proxy is ignored). + Using this object avoids that the internals/concat.py needs to determine + the proper dtype and array type. + """ + + ndim = 1 + + def __init__(self, n: int): + self.n = n + + @property + def shape(self): + return (self.n,) + + def to_array(self, dtype: DtypeObj) -> ArrayLike: + """ + Helper function to create the actual all-NA array from the NullArrayProxy + object. + + Parameters + ---------- + arr : NullArrayProxy + dtype : the dtype for the resulting array + + Returns + ------- + np.ndarray or ExtensionArray + """ + if is_extension_array_dtype(dtype): + empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) + indexer = -np.ones(self.n, dtype=np.intp) + return empty.take(indexer, allow_fill=True) + else: + # when introducing missing values, int becomes float, bool becomes object + dtype = ensure_dtype_can_hold_na(dtype) + fill_value = na_value_for_dtype(dtype) + arr = np.empty(self.n, dtype=dtype) + arr.fill(fill_value) + return ensure_wrapped_if_datetimelike(arr) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 11446264ce747..ecec4d04942fb 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -4,6 +4,7 @@ import itertools from typing import ( TYPE_CHECKING, + Any, Dict, List, Sequence, @@ -31,7 +32,7 @@ is_sparse, ) from pandas.core.dtypes.concat import ( - concat_arrays, + cast_to_common_type, concat_compat, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -45,7 +46,10 @@ DatetimeArray, ExtensionArray, ) -from pandas.core.internals.array_manager import ArrayManager +from 
pandas.core.internals.array_manager import ( + ArrayManager, + NullArrayProxy, +) from pandas.core.internals.blocks import ( ensure_block_shape, new_block, @@ -97,6 +101,93 @@ def _concatenate_array_managers( return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) +def concat_arrays(to_concat: List[Any]) -> ArrayLike: + """ + Alternative for concat_compat but specialized for use in the ArrayManager. + + Differences: only deals with 1D arrays (no axis keyword), assumes + ensure_wrapped_if_datetimelike and does not skip empty arrays to determine + the dtype. + In addition ensures that all NullArrayProxies get replaced with actual + arrays. + + Parameters + ---------- + to_concat : list of arrays + + Returns + ------- + np.ndarray or ExtensionArray + """ + # ignore the all-NA proxies to determine the resulting dtype + to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] + + kinds = {obj.dtype.kind for obj in to_concat_no_proxy} + single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 + any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat_no_proxy) + + if any_ea: + if not single_dtype: + target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy]) + to_concat = [ + arr.to_array(target_dtype) + if isinstance(arr, NullArrayProxy) + else cast_to_common_type(arr, target_dtype) + for arr in to_concat + ] + else: + target_dtype = to_concat_no_proxy[0].dtype + to_concat = [ + arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr + for arr in to_concat + ] + + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + else: + return np.concatenate(to_concat) + + if not single_dtype: + if any(kind in ["m", "M"] for kind in kinds): + # multiple types, need to coerce to object + target_dtype = np.dtype(object) + else: + target_dtype = np.find_common_type( + [arr.dtype for arr in to_concat_no_proxy], [] + ) + else: + target_dtype = 
to_concat_no_proxy[0].dtype + + if target_dtype.kind in ["m", "M"]: + # for datetimelike use DatetimeArray/TimedeltaArray concatenation + # don't use arr.astype(target_dtype, copy=False), because that doesn't + # work for DatetimeArray/TimedeltaArray (returns ndarray) + to_concat = [ + arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr + for arr in to_concat + ] + return type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0) + + to_concat = [ + arr.to_array(target_dtype) + if isinstance(arr, NullArrayProxy) + else arr.astype(target_dtype, copy=False) + for arr in to_concat + ] + + result = np.concatenate(to_concat) + + # TODO decide on exact behaviour (we shouldn't do this only for empty result) + # see https://github.com/pandas-dev/pandas/issues/39817 + if len(result) == 0: + # all empties -> check for bool to not coerce to float + if len(kinds) != 1: + if "b" in kinds: + result = result.astype(object) + return result + + def concatenate_managers( mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool ) -> Manager: From 8c10a53ebc0c2f1ca8ca9c0c2eeb944e1c6df963 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Mar 2021 15:39:12 +0100 Subject: [PATCH 14/21] fix typing --- pandas/core/internals/array_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index cc1526b78ec9f..581a7cfc861c1 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -12,6 +12,7 @@ Tuple, TypeVar, Union, + cast, ) import numpy as np @@ -1338,10 +1339,12 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike: np.ndarray or ExtensionArray """ if is_extension_array_dtype(dtype): + dtype = cast(ExtensionDtype, dtype) empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) indexer = -np.ones(self.n, dtype=np.intp) return empty.take(indexer, allow_fill=True) else: + dtype = cast(np.dtype, dtype) # when 
introducing missing values, int becomes float, bool becomes object dtype = ensure_dtype_can_hold_na(dtype) fill_value = na_value_for_dtype(dtype) From f0061f79a49710030e3c75944e1b6a65faff7aec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Mar 2021 11:46:37 +0100 Subject: [PATCH 15/21] update type check --- pandas/core/internals/array_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 5da319f4c28f2..0081ea061e128 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1334,8 +1334,7 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike: ------- np.ndarray or ExtensionArray """ - if is_extension_array_dtype(dtype): - dtype = cast(ExtensionDtype, dtype) + if isinstance(dtype, ExtensionDtype): empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) indexer = -np.ones(self.n, dtype=np.intp) return empty.take(indexer, allow_fill=True) From 9ba88540e788791c205c73d38a1607c0f7eee9c1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Mar 2021 11:58:58 +0100 Subject: [PATCH 16/21] simplify casting to_concat + fix skips --- pandas/core/dtypes/concat.py | 2 ++ pandas/core/internals/concat.py | 17 +++++++---------- pandas/tests/extension/base/reshaping.py | 3 --- pandas/tests/reshape/concat/__init__.py | 4 ---- 4 files changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f51f8faae5604..cfadb3e9f45c5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -35,6 +35,8 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: Helper function for `arr.astype(common_dtype)` but handling all special cases.
""" + if is_dtype_equal(arr.dtype, dtype): + return arr if ( is_categorical_dtype(arr.dtype) and isinstance(dtype, np.dtype) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index ecec4d04942fb..bb590dec84c5c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -129,18 +129,15 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: if any_ea: if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy]) - to_concat = [ - arr.to_array(target_dtype) - if isinstance(arr, NullArrayProxy) - else cast_to_common_type(arr, target_dtype) - for arr in to_concat - ] else: target_dtype = to_concat_no_proxy[0].dtype - to_concat = [ - arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr - for arr in to_concat - ] + + to_concat = [ + arr.to_array(target_dtype) + if isinstance(arr, NullArrayProxy) + else cast_to_common_type(arr, target_dtype) + for arr in to_concat + ] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 5a2d928eea744..de3af31ece7b0 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas.api.extensions import ExtensionArray from pandas.core.internals import ExtensionBlock @@ -111,7 +109,6 @@ def test_concat_extension_arrays_copy_false(self, data, na_value): result = pd.concat([df1, df2], axis=1, copy=False) self.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat reindex def test_concat_with_reindex(self, data): # GH-33027 a = pd.DataFrame({"a": data[:5]}) diff --git a/pandas/tests/reshape/concat/__init__.py b/pandas/tests/reshape/concat/__init__.py index 777923be02398..e69de29bb2d1d 100644 --- 
a/pandas/tests/reshape/concat/__init__.py +++ b/pandas/tests/reshape/concat/__init__.py @@ -1,4 +0,0 @@ -import pandas.util._test_decorators as td - -# TODO(ArrayManager) concat axis=0 -pytestmark = td.skip_array_manager_not_yet_implemented From ad61f2f84abc66b161879b2e11677c7048dd949b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Mar 2021 12:06:45 +0100 Subject: [PATCH 17/21] further simplify concat_arrays --- pandas/core/internals/concat.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index bb590dec84c5c..b7e86245c655b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -131,21 +131,7 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy]) else: target_dtype = to_concat_no_proxy[0].dtype - - to_concat = [ - arr.to_array(target_dtype) - if isinstance(arr, NullArrayProxy) - else cast_to_common_type(arr, target_dtype) - for arr in to_concat - ] - - if isinstance(to_concat[0], ExtensionArray): - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat) - else: - return np.concatenate(to_concat) - - if not single_dtype: + elif not single_dtype: if any(kind in ["m", "M"] for kind in kinds): # multiple types, need to coerce to object target_dtype = np.dtype(object) @@ -169,15 +155,19 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: to_concat = [ arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) - else arr.astype(target_dtype, copy=False) + else cast_to_common_type(arr, target_dtype) for arr in to_concat ] + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + result = np.concatenate(to_concat) # TODO decide on exact behaviour (we shouldn't do this only for empty result) # see https://github.com/pandas-dev/pandas/issues/39817 - if len(result) == 
0: + if len(result) == 0 and not any_ea: # all empties -> check for bool to not coerce to float if len(kinds) != 1: if "b" in kinds: From a3c2662a21097e77c59ebed3a268f9ff27644d43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Mar 2021 10:25:06 +0100 Subject: [PATCH 18/21] remove redundant cast --- pandas/core/internals/array_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b9dadb392c533..d01028a9d3b8a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -12,7 +12,6 @@ Tuple, TypeVar, Union, - cast, ) import numpy as np @@ -1336,7 +1335,6 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike: indexer = -np.ones(self.n, dtype=np.intp) return empty.take(indexer, allow_fill=True) else: - dtype = cast(np.dtype, dtype) # when introducing missing values, int becomes float, bool becomes object dtype = ensure_dtype_can_hold_na(dtype) fill_value = na_value_for_dtype(dtype) From f67e9e24ec096a9fcedad60c28d04ec9381ad5fc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Mar 2021 11:22:09 +0200 Subject: [PATCH 19/21] simplify usage of find_common_type --- pandas/core/internals/concat.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 6b9f20d2a5716..5e78990a73a6a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -123,23 +123,10 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: # ignore the all-NA proxies to determine the resulting dtype to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - kinds = {obj.dtype.kind for obj in to_concat_no_proxy} single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 - any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat_no_proxy) - if any_ea: - if not single_dtype: - 
target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy]) - else: - target_dtype = to_concat_no_proxy[0].dtype - elif not single_dtype: - if any(kind in ["m", "M"] for kind in kinds): - # multiple types, need to coerce to object - target_dtype = np.dtype(object) - else: - target_dtype = np.find_common_type( - [arr.dtype for arr in to_concat_no_proxy], [] - ) + if not single_dtype: + target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) else: target_dtype = to_concat_no_proxy[0].dtype @@ -168,8 +155,9 @@ def concat_arrays(to_concat: List[Any]) -> ArrayLike: # TODO decide on exact behaviour (we shouldn't do this only for empty result) # see https://github.com/pandas-dev/pandas/issues/39817 - if len(result) == 0 and not any_ea: + if len(result) == 0: # all empties -> check for bool to not coerce to float + kinds = {obj.dtype.kind for obj in to_concat_no_proxy} if len(kinds) != 1: if "b" in kinds: result = result.astype(object) From d21bd3aaccbecd29fa2b794c609cd005c645f44f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Mar 2021 20:29:52 +0200 Subject: [PATCH 20/21] update annotation --- pandas/core/internals/concat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 5e78990a73a6a..ff6332f6c2572 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -4,7 +4,6 @@ import itertools from typing import ( TYPE_CHECKING, - Any, Dict, List, Sequence, @@ -102,7 +101,7 @@ def _concatenate_array_managers( return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) -def concat_arrays(to_concat: List[Any]) -> ArrayLike: +def concat_arrays(to_concat: List) -> ArrayLike: """ Alternative for concat_compat but specialized for use in the ArrayManager. 
From 77b05f46f26b4dc1907dcf7273bfbbbacf7cb4a0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Apr 2021 15:49:03 +0200 Subject: [PATCH 21/21] fixup typing --- pandas/core/internals/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index e2cc074d0052a..687c8768fb251 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -100,7 +100,7 @@ def _concatenate_array_managers( return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) -def concat_arrays(to_concat: List) -> ArrayLike: +def concat_arrays(to_concat: list) -> ArrayLike: """ Alternative for concat_compat but specialized for use in the ArrayManager.