From 69cf7bb269d7d627c3256a7fa58b3637b51cef98 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Mar 2021 21:38:10 +0100 Subject: [PATCH 01/15] [ArrayManager] Add SingleArrayManager to back a Series --- .github/workflows/ci.yml | 3 + pandas/core/generic.py | 3 +- pandas/core/indexing.py | 6 +- pandas/core/internals/__init__.py | 12 +- pandas/core/internals/array_manager.py | 257 ++++++++++++++++++-- pandas/core/internals/base.py | 4 + pandas/core/internals/managers.py | 7 +- pandas/core/series.py | 41 +++- pandas/tests/frame/methods/test_rename.py | 1 + pandas/tests/series/methods/test_explode.py | 3 + pandas/tests/series/test_constructors.py | 9 +- pandas/util/_exceptions.py | 2 +- 12 files changed, 306 insertions(+), 42 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a7ac2325740a..16a7f7a4462c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -161,6 +161,9 @@ jobs: pytest pandas/tests/resample/ --array-manager pytest pandas/tests/reshape/merge --array-manager + pytest pandas/tests/series/methods --array-manager + pytest pandas/tests/series/test_* --array-manager + # indexing subset (temporary since other tests don't pass yet) pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager pytest pandas/tests/frame/indexing/test_where.py --array-manager diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4774045849eb6..5504120630102 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -138,6 +138,7 @@ from pandas.core.internals import ( ArrayManager, BlockManager, + SingleArrayManager, ) from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME @@ -5562,7 +5563,7 @@ def _protect_consolidate(self, f): Consolidate _mgr -- if the blocks have changed, then clear the cache """ - if isinstance(self._mgr, ArrayManager): + if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): 
return f() blocks_before = len(self._mgr.blocks) result = f() diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cfe16627d5c64..21bb835b76f2b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1575,7 +1575,11 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and self.obj._mgr.blocks and self.ndim > 1: + if ( + not take_split_path + and getattr(self.obj._mgr, "blocks", False) + and self.ndim > 1 + ): # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value blk = self.obj._mgr.blocks[0] diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 054ce8a40288b..272b4e85e4928 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,5 +1,11 @@ -from pandas.core.internals.array_manager import ArrayManager -from pandas.core.internals.base import DataManager +from pandas.core.internals.array_manager import ( + ArrayManager, + SingleArrayManager, +) +from pandas.core.internals.base import ( + DataManager, + SingleManager, +) from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, CategoricalBlock, @@ -35,6 +41,8 @@ "ArrayManager", "BlockManager", "SingleBlockManager", + "SingleManager", + "SingleArrayManager", "concatenate_managers", # those two are preserved here for downstream compatibility (GH-33892) "create_block_manager_from_arrays", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 5001754017dda..24799f3e7c63d 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -3,8 +3,8 @@ """ from __future__ import annotations +import inspect from typing import ( - TYPE_CHECKING, Any, Callable, List, @@ -28,16 +28,21 @@ from pandas.util._validators import 
validate_bool_kwarg from pandas.core.dtypes.cast import ( + astype_dt64_to_dt64tz, + astype_nansafe, find_common_type, infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( is_bool_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_numeric_dtype, is_object_dtype, is_timedelta64_ns_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import ( ExtensionDtype, @@ -53,7 +58,13 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, +) from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -65,14 +76,80 @@ Index, ensure_index, ) -from pandas.core.internals.base import DataManager +from pandas.core.internals.base import ( + DataManager, + SingleManager, +) from pandas.core.internals.blocks import make_block -if TYPE_CHECKING: - from pandas.core.internals.managers import SingleBlockManager +T = TypeVar("T", bound="ArrayManager") + +def astype_array(values, dtype, copy): + if ( + values.dtype.kind in ["m", "M"] + and dtype.kind in ["i", "u"] + and isinstance(dtype, np.dtype) + and dtype.itemsize != 8 + ): + # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced + msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" + raise TypeError(msg) -T = TypeVar("T", bound="ArrayManager") + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) + + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + if isinstance(values, ExtensionArray): + values = values.astype(dtype, copy=copy) + + else: + values = astype_nansafe(values, dtype, copy=copy) + + # now in ObjectBlock._maybe_coerce_values(cls, values): + if isinstance(dtype, np.dtype) and 
issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + return values + + +def astype_array_safe(values, dtype, copy=False, errors="raise"): + + errors_legal_values = ("raise", "ignore") + + if errors not in errors_legal_values: + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of " + f"{list(errors_legal_values)}. Supplied value is '{errors}'" + ) + raise ValueError(invalid_arg) + + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + f"Expected an instance of {dtype.__name__}, " + "but got the class instead. Try instantiating 'dtype'." + ) + raise TypeError(msg) + + dtype = pandas_dtype(dtype) + + if isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + + try: + new_values = astype_array(values, dtype, copy=copy) + except (ValueError, TypeError): + # e.g. astype_nansafe can fail on object-dtype of strings + # trying to convert to float + if errors == "ignore": + new_values = values + else: + raise + return new_values class ArrayManager(DataManager): @@ -113,6 +190,7 @@ def __init__( if verify_integrity: self._axes = [ensure_index(ax) for ax in axes] + self.arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] self._verify_integrity() def make_empty(self: T, axes=None) -> T: @@ -125,7 +203,7 @@ def make_empty(self: T, axes=None) -> T: @property def items(self) -> Index: - return self._axes[1] + return self._axes[-1] @property def axes(self) -> List[Index]: # type: ignore[override] @@ -400,7 +478,10 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: # The caller is responsible for ensuring that # obj.axes[-1].equals(self.items) if obj.ndim == 1: - kwargs[k] = obj.iloc[[i]] + if self.ndim == 2: + kwargs[k] = obj.iloc[[i]]._values + else: + kwargs[k] = obj.iloc[:]._values else: kwargs[k] = obj.iloc[:, [i]]._values else: @@ -413,15 +494,21 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: elif arr.dtype.kind == "m" and not isinstance(arr, 
np.ndarray): # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock arr = arr._data # type: ignore[union-attr] - if isinstance(arr, np.ndarray): - arr = np.atleast_2d(arr) - block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + + if self.ndim == 2: + if isinstance(arr, np.ndarray): + arr = np.atleast_2d(arr) + block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + else: + block = make_block(arr, placement=slice(0, len(self), 1), ndim=1) + applied = getattr(block, f)(**kwargs) if isinstance(applied, list): applied = applied[0] arr = applied.values - if isinstance(arr, np.ndarray): - arr = arr[0, :] + if self.ndim == 2: + if isinstance(arr, np.ndarray): + arr = arr[0, :] result_arrays.append(arr) return type(self)(result_arrays, self._axes) @@ -499,7 +586,7 @@ def downcast(self) -> ArrayManager: return self.apply_with_block("downcast") def astype(self, dtype, copy: bool = False, errors: str = "raise") -> ArrayManager: - return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) + return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) def convert( self, @@ -731,16 +818,12 @@ def fast_xs(self, loc: int) -> ArrayLike: result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) return result - def iget(self, i: int) -> SingleBlockManager: + def iget(self, i: int) -> SingleArrayManager: """ - Return the data as a SingleBlockManager. + Return the data as a SingleArrayManager. 
""" - from pandas.core.internals.managers import SingleBlockManager - values = self.arrays[i] - block = make_block(values, placement=slice(0, len(values)), ndim=1) - - return SingleBlockManager(block, self._axes[0]) + return SingleArrayManager([values], [self._axes[0]]) def iget_values(self, i: int) -> ArrayLike: """ @@ -900,8 +983,8 @@ def _reindex_indexer( if not allow_dups: self._axes[axis]._validate_can_reindex(indexer) - # if axis >= self.ndim: - # raise IndexError("Requested axis not found in manager") + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") if axis == 1: new_arrays = [] @@ -1030,3 +1113,131 @@ def _interleaved_dtype(blocks) -> Optional[DtypeObj]: return None return find_common_type([b.dtype for b in blocks]) + + +class SingleArrayManager(ArrayManager, SingleManager): + + __slots__ = [ + "_axes", # private attribute, because 'axes' has different order, see below + "arrays", + ] + + arrays: List[Union[np.ndarray, ExtensionArray]] + _axes: List[Index] + + ndim = 1 + + def __init__( + self, + arrays: Union[np.ndarray, ExtensionArray], + axes: List[Index], + verify_integrity: bool = True, + ): + self._axes = axes + self.arrays = arrays + + if verify_integrity: + assert len(axes) == 1 + assert len(arrays) == 1 + self._axes = [ensure_index(ax) for ax in self._axes] + self.arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] + self._verify_integrity() + + def _verify_integrity(self) -> None: + (n_rows,) = self.shape + assert len(self.arrays) == 1 + assert len(self.arrays[0]) == n_rows + + @staticmethod + def _normalize_axis(axis): + return axis + + def make_empty(self: T, axes=None) -> T: + """Return an empty ArrayManager with index/array of length 0""" + if axes is None: + axes = [Index([], dtype=object)] + array = np.array([], dtype=self.dtype) + return type(self)([array], axes) + + @classmethod + def from_array(cls, array, index): + return cls([array], [index]) + + @property + def axes(self): + return 
self._axes + + @property + def index(self) -> Index: + return self._axes[0] + + @property + def array(self): + return self.arrays[0] + + @property + def dtype(self): + return self.array.dtype + + def external_values(self): + """The array that Series.values returns""" + if isinstance(self.array, (PeriodArray, IntervalArray)): + return self.array.astype(object) + elif isinstance(self.array, (DatetimeArray, TimedeltaArray)): + return self.array._data + else: + return self.array + + def internal_values(self): + """The array that Series._values returns""" + return self.array + + @property + def _can_hold_na(self) -> bool: + if isinstance(self.array, np.ndarray): + return self.array.dtype.kind not in ["b", "i", "u"] + else: + # ExtensionArray + return self.array._can_hold_na + + @property + def is_single_block(self) -> bool: + return True + + def _consolidate_check(self): + pass + + def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + new_array = self.array[slobj] + new_index = self.index[slobj] + return type(self)([new_array], [new_index]) + + def apply(self, func, **kwargs): + if callable(func): + new_array = func(self.array, **kwargs) + else: + new_array = getattr(self.array, func)(**kwargs) + return type(self)([new_array], self._axes) + + def setitem(self, indexer, value): + return self.apply_with_block("setitem", indexer=indexer, value=value) + + def idelete(self, indexer): + """ + Delete selected locations in-place (new array, same ArrayManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[0][to_keep]] + self._axes = [self._axes[0][to_keep]] + + def _get_data_subset(self, predicate: Callable) -> ArrayManager: + # used in get_numeric_data / get_bool_data + if predicate(self.array): + return type(self)(self.arrays, self._axes, verify_integrity=False) + else: + return self.make_empty() diff --git 
a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 2ce91134f61d6..4a245102aaa61 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -98,3 +98,7 @@ def equals(self, other: object) -> bool: return False return self._equal_values(other) + + +class SingleManager(DataManager): + ndim = 1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e013a7f680d6f..b782cfa64dd5d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -61,7 +61,10 @@ Index, ensure_index, ) -from pandas.core.internals.base import DataManager +from pandas.core.internals.base import ( + DataManager, + SingleManager, +) from pandas.core.internals.blocks import ( Block, CategoricalBlock, @@ -1525,7 +1528,7 @@ def unstack(self, unstacker, fill_value) -> BlockManager: return bm -class SingleBlockManager(BlockManager): +class SingleBlockManager(BlockManager, SingleManager): """ manage a single block with """ ndim = 1 diff --git a/pandas/core/series.py b/pandas/core/series.py index ddfeea381ff2e..d103cf6088a20 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -97,7 +97,10 @@ ) from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + ExtensionArray, + PandasArray, +) from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com @@ -125,7 +128,11 @@ from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer -from pandas.core.internals import SingleBlockManager +from pandas.core.internals import ( + SingleArrayManager, + SingleBlockManager, + SingleManager, +) from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -267,7 +274,7 
@@ class Series(base.IndexOpsMixin, generic.NDFrame): base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) __hash__ = generic.NDFrame.__hash__ - _mgr: SingleBlockManager + _mgr: SingleManager div: Callable[[Series, Any], Series] rdiv: Callable[[Series, Any], Series] @@ -285,7 +292,7 @@ def __init__( ): if ( - isinstance(data, SingleBlockManager) + isinstance(data, SingleManager) and index is None and dtype is None and copy is False @@ -299,8 +306,12 @@ def __init__( if fastpath: # data is an ndarray, index is defined - if not isinstance(data, SingleBlockManager): - data = SingleBlockManager.from_array(data, index) + if not isinstance(data, SingleManager): + manager = get_option("mode.data_manager") + if manager == "block": + data = SingleBlockManager.from_array(data, index) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) if copy: data = data.copy() if index is None: @@ -363,7 +374,7 @@ def __init__( data, index = self._init_dict(data, index, dtype) dtype = None copy = False - elif isinstance(data, SingleBlockManager): + elif isinstance(data, SingleManager): if index is None: index = data.index elif not data.index.equals(index) or copy: @@ -390,7 +401,7 @@ def __init__( com.require_length_match(data, index) # create/copy the manager - if isinstance(data, SingleBlockManager): + if isinstance(data, SingleManager): if dtype is not None: data = data.astype(dtype=dtype, errors="ignore", copy=copy) elif copy: @@ -398,7 +409,11 @@ def __init__( else: data = sanitize_array(data, index, dtype, copy) - data = SingleBlockManager.from_array(data, index) + manager = get_option("mode.data_manager") + if manager == "block": + data = SingleBlockManager.from_array(data, index) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) generic.NDFrame.__init__(self, data) self.name = name @@ -659,7 +674,13 @@ def _values(self): @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def 
array(self) -> ExtensionArray: - return self._mgr._block.array_values() + if isinstance(self._mgr, SingleBlockManager): + return self._mgr._block.array_values() + else: + arr = self._mgr.array + if isinstance(arr, np.ndarray): + arr = PandasArray(arr) + return arr # ops def ravel(self, order="C"): diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 677d862dfe077..98999b5a79f94 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -170,6 +170,7 @@ def test_rename_multiindex(self): renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) + @td.skip_array_manager_not_yet_implemented def test_rename_nocopy(self, float_frame): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) renamed["foo"] = 1.0 diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 1f0fbd1cc5ecb..12ca5c3cb4a0f 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -65,6 +67,7 @@ def test_large(): tm.assert_series_equal(result, s) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) apply def test_invert_array(): df = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")}) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 63c9b4d899622..e4c50ed19ce40 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -13,6 +13,7 @@ iNaT, lib, ) +import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -646,6 +647,7 @@ def test_constructor_copy(self): assert x[0] == 2.0 assert y[0] == 1.0 + 
@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite test @pytest.mark.parametrize( "index", [ @@ -1682,12 +1684,14 @@ def test_series_constructor_infer_multiindex(self): class TestSeriesConstructorInternals: - def test_constructor_no_pandas_array(self): + def test_constructor_no_pandas_array(self, using_array_manager): ser = Series([1, 2, 3]) result = Series(ser.array) tm.assert_series_equal(ser, result) - assert isinstance(result._mgr.blocks[0], NumericBlock) + if not using_array_manager: + assert isinstance(result._mgr.blocks[0], NumericBlock) + @td.skip_array_manager_invalid_test def test_from_array(self): result = Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False @@ -1695,6 +1699,7 @@ def test_from_array(self): result = Series(pd.array(["2015"], dtype="datetime64[ns]")) assert result._mgr.blocks[0].is_extension is False + @td.skip_array_manager_invalid_test def test_from_list_dtype(self): result = Series(["1H", "2H"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 5ca96a1f9989f..7c75276616e98 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -31,7 +31,7 @@ def find_stack_level() -> int: if stack[n].function == "astype": break - while stack[n].function in ["astype", "apply", "_astype"]: + while stack[n].function in ["astype", "apply", "astype_array", "astype_array_safe"]: # e.g. 
# bump up Block.astype -> BlockManager.astype -> NDFrame.astype # bump up Datetime.Array.astype -> DatetimeIndex.astype From 758df4a5ef41b801ca5307f0211b924c5907ad14 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 08:40:54 +0100 Subject: [PATCH 02/15] fix fast_xs --- pandas/core/internals/array_manager.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 24799f3e7c63d..a4773efeac7ec 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -36,6 +36,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, @@ -802,20 +803,16 @@ def fast_xs(self, loc: int) -> ArrayLike: """ dtype = _interleaved_dtype(self.arrays) - if isinstance(dtype, SparseDtype): - temp_dtype = dtype.subtype - elif isinstance(dtype, PandasDtype): - temp_dtype = dtype.numpy_dtype - elif is_extension_array_dtype(dtype): - temp_dtype = "object" - elif is_dtype_equal(dtype, str): - temp_dtype = "object" - else: - temp_dtype = dtype - - result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype) + values = [arr[loc] for arr in self.arrays] if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + result = dtype.construct_array_type()._from_sequence(values, dtype=dtype) + # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT + elif is_datetime64_ns_dtype(dtype): + result = DatetimeArray._from_sequence(values, dtype=dtype)._data + elif is_timedelta64_ns_dtype(dtype): + result = TimedeltaArray._from_sequence(values, dtype=dtype)._data + else: + result = np.array(values, dtype=dtype) return result def iget(self, i: int) -> SingleArrayManager: From 2c6247338df9424fb4c7694d9747cca8806102cf Mon Sep 17 00:00:00 
2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 08:42:37 +0100 Subject: [PATCH 03/15] fix astype tests --- pandas/util/_exceptions.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 7c75276616e98..f8b65983fbdde 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -31,7 +31,13 @@ def find_stack_level() -> int: if stack[n].function == "astype": break - while stack[n].function in ["astype", "apply", "astype_array", "astype_array_safe"]: + while stack[n].function in [ + "astype", + "apply", + "_astype", + "astype_array", + "astype_array_safe", + ]: # e.g. # bump up Block.astype -> BlockManager.astype -> NDFrame.astype # bump up Datetime.Array.astype -> DatetimeIndex.astype From 50fe6e0bde77c0864e874d92b209811c8a139ce0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 09:20:08 +0100 Subject: [PATCH 04/15] add SingleArrayManager.set_values - fix apply --- pandas/core/apply.py | 5 ++--- pandas/core/internals/array_manager.py | 9 +++++++++ pandas/core/internals/managers.py | 9 +++++++++ pandas/tests/series/methods/test_explode.py | 3 --- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 15cee1419afb5..a4e8668f71909 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -883,9 +883,8 @@ def series_generator(self): # of it. Kids: don't do this at home. 
ser = self.obj._ixs(0, axis=0) mgr = ser._mgr - blk = mgr.blocks[0] - if is_extension_array_dtype(blk.dtype): + if is_extension_array_dtype(ser.dtype): # values will be incorrect for this block # TODO(EA2D): special case would be unnecessary with 2D EAs obj = self.obj @@ -896,7 +895,7 @@ def series_generator(self): for (arr, name) in zip(values, self.index): # GH#35462 re-pin mgr in case setitem changed it ser._mgr = mgr - blk.values = arr + ser._mgr.set_values(arr) ser.name = name yield ser diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a4773efeac7ec..25b6146a37b10 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1238,3 +1238,12 @@ def _get_data_subset(self, predicate: Callable) -> ArrayManager: return type(self)(self.arrays, self._axes, verify_integrity=False) else: return self.make_empty() + + def set_values(self, values): + """ + Set (replace) the values of the SingleArrayManager in place. + + Use at your own risk! This does not check if the passed values are + valid for the current SingleArrayManager (length, dtype, etc). + """ + self.arrays[0] = values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b782cfa64dd5d..5c05bf676d939 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1649,6 +1649,15 @@ def fast_xs(self, loc): """ raise NotImplementedError("Use series._values[loc] instead") + def set_values(self, values): + """ + Set the values of the single block in place. + + Use at your own risk! This does not check if the passed values are + valid for the current Block/SingleBlockManager (length, dtype, etc). 
+ """ + self.blocks[0].values = values + # -------------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 12ca5c3cb4a0f..1f0fbd1cc5ecb 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm @@ -67,7 +65,6 @@ def test_large(): tm.assert_series_equal(result, s) -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) apply def test_invert_array(): df = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")}) From dd2511a6709e40ca3718764107386d5dc2958e5a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 18:34:44 +0100 Subject: [PATCH 05/15] use slice to avoid copy --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index fe2bdc3387d46..1d6f6eeb22bd5 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -408,7 +408,7 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: # obj.axes[-1].equals(self.items) if obj.ndim == 1: if self.ndim == 2: - kwargs[k] = obj.iloc[[i]]._values + kwargs[k] = obj.iloc[slice(i, i + 1)]._values else: kwargs[k] = obj.iloc[:]._values else: From e7e0cad6657de695e9773ee14c938f1a9a476e26 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 10:49:59 +0100 Subject: [PATCH 06/15] avoid PandasDtype in construction --- pandas/core/construction.py | 2 +- pandas/core/internals/array_manager.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9ba9a5bd38164..43900709ad11f 100644 --- 
a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -481,7 +481,6 @@ def sanitize_array( DataFrame constructor, as the dtype keyword there may be interpreted as only applying to a subset of columns, see GH#24435. """ - if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) @@ -555,6 +554,7 @@ def sanitize_array( inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) + subarr = extract_array(subarr, extract_numpy=True) return subarr diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 225b92f8365eb..1bf3905d9430c 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -46,6 +46,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, + ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -1071,7 +1072,11 @@ def __init__( assert len(axes) == 1 assert len(arrays) == 1 self._axes = [ensure_index(ax) for ax in self._axes] - self.arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] + arr = arrays[0] + arr = ensure_wrapped_if_datetimelike(arr) + if isinstance(arr, ABCPandasArray): + arr = arr.to_numpy() + self.arrays = [arr] self._verify_integrity() def _verify_integrity(self) -> None: From c3c00f43061e52c8eb23ec6fcc79f704c4562dfd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 13:12:07 +0100 Subject: [PATCH 07/15] fix groupby --- pandas/_libs/reduction.pyx | 12 +++++++++--- pandas/core/groupby/generic.py | 20 +++++++++++--------- pandas/tests/groupby/test_bin_groupby.py | 9 ++++++++- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 4d0bd4744be5d..0e952bff6100c 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -59,6 +59,7 @@ cdef class _BaseGrouper: cached_typ = self.typ( vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, 
name=self.name ) + self.has_block = hasattr(cached_typ._mgr, "_block") else: # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference @@ -66,9 +67,12 @@ cdef class _BaseGrouper: object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() cached_ityp._cache.clear() # e.g. inferred_freq must go - object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) - object.__setattr__(cached_typ._mgr._block, 'mgr_locs', - slice(len(vslider.buf))) + if self.has_block: + object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) + object.__setattr__(cached_typ._mgr._block, 'mgr_locs', + slice(len(vslider.buf))) + else: + cached_typ._mgr.arrays[0] = vslider.buf object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', self.name) @@ -108,6 +112,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): cdef public: ndarray arr, index, dummy_arr, dummy_index object values, f, bins, typ, ityp, name + bint has_block def __init__(self, object series, object f, object bins): @@ -201,6 +206,7 @@ cdef class SeriesGrouper(_BaseGrouper): cdef public: ndarray arr, index, dummy_arr, dummy_index object f, labels, values, typ, ityp, name + bint has_block def __init__(self, object series, object f, object labels, Py_ssize_t ngroups): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index aaf67fb1be532..4075dd3ba4e94 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1152,16 +1152,18 @@ def py_fallback(values: ArrayLike) -> ArrayLike: result = result._consolidate() assert isinstance(result, (Series, DataFrame)) # for mypy mgr = result._mgr - assert isinstance(mgr, BlockManager) - - # unwrap DataFrame to get array - if len(mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. 
See eg GH-39329 - return mgr.as_array() + if isinstance(mgr, BlockManager): + # unwrap DataFrame to get array + if len(mgr.blocks) != 1: + # We've split an object block! Everything we've assumed + # about a single block input returning a single block output + # is a lie. See eg GH-39329 + return mgr.as_array() + else: + result = mgr.blocks[0].values + return result else: - result = mgr.blocks[0].values + result = mgr.arrays[0] return result def array_func(values: ArrayLike) -> ArrayLike: diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index f873c93d90683..bb541739c7f44 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -5,6 +5,7 @@ lib, reduction as libreduction, ) +import pandas.util._test_decorators as td import pandas as pd from pandas import Series @@ -61,7 +62,13 @@ def cumsum_max(x): return 0 -@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths]) +@pytest.mark.parametrize( + "func", + [ + cumsum_max, + pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test), + ], +) def test_mgr_locs_updated(func): # https://github.com/pandas-dev/pandas/issues/31802 # Some operations may require creating new blocks, which requires From 022da5aadac1d521fe13f2747ea5f5287fbfe073 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 13:48:35 +0100 Subject: [PATCH 08/15] fix cumulative ops --- pandas/core/nanops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a0dfb1c83a70b..288668b700ad0 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1742,10 +1742,12 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: if isinstance(values, np.ndarray): result = result.view(orig_dtype) else: - # DatetimeArray + # DatetimeArray/TimedeltaArray # TODO: have this case go through a DTA method? 
+ # For DatetimeTZDtype, view result as M8[ns] + npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]" result = type(values)._simple_new( # type: ignore[attr-defined] - result.view("M8[ns]"), dtype=orig_dtype + result.view(npdtype), dtype=orig_dtype ) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): From ffcbf37433b5762b1dbf3d9c472e6ec870a5cdc7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 14:29:30 +0100 Subject: [PATCH 09/15] fix putmask --- pandas/core/internals/array_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1bf3905d9430c..443677c1b61be 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -192,7 +192,8 @@ def get_dtypes(self): def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" - output += f"\nColumns: {self._axes[1]}" + if self.ndim == 2: + output += f"\nColumns: {self._axes[1]}" output += f"\n{len(self.arrays)} arrays:" for arr in self.arrays: output += f"\n{arr.dtype}" @@ -416,7 +417,8 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: kwargs[k] = obj.iloc[:, [i]]._values else: # otherwise we have an ndarray - kwargs[k] = obj[[i]] + if obj.ndim == 2: + kwargs[k] = obj[[i]] if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] # DatetimeArray needs to be converted to ndarray for DatetimeBlock @@ -469,7 +471,6 @@ def where(self, other, cond, align: bool, errors: str, axis: int) -> ArrayManage # return self.apply_with_block("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): - if align: align_keys = ["new", "mask"] else: From 4d8a029452d5ee21a97fc9edd015e0709db2536a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 14:33:37 +0100 Subject: [PATCH 10/15] rename SingleManager -> 
SingleDataManager, add typing union alias --- pandas/_typing.py | 3 +++ pandas/core/internals/__init__.py | 4 ++-- pandas/core/internals/base.py | 2 +- pandas/core/internals/managers.py | 4 ++-- pandas/core/series.py | 11 ++++++----- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index c50d532f40dd7..e464f2a021ef6 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -58,6 +58,8 @@ from pandas.core.internals import ( ArrayManager, BlockManager, + SingleArrayManager, + SingleBlockManager, ) from pandas.core.resample import Resampler from pandas.core.series import Series @@ -184,3 +186,4 @@ # internals Manager = Union["ArrayManager", "BlockManager"] +SingleManager = Union["SingleArrayManager", "SingleBlockManager"] diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 272b4e85e4928..23d35b412e1ae 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -4,7 +4,7 @@ ) from pandas.core.internals.base import ( DataManager, - SingleManager, + SingleDataManager, ) from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, @@ -40,8 +40,8 @@ "DataManager", "ArrayManager", "BlockManager", + "SingleDataManager", "SingleBlockManager", - "SingleManager", "SingleArrayManager", "concatenate_managers", # those two are preserved here for downstream compatibility (GH-33892) diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 4a245102aaa61..0e4b5ce2e7452 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -100,5 +100,5 @@ def equals(self, other: object) -> bool: return self._equal_values(other) -class SingleManager(DataManager): +class SingleDataManager(DataManager): ndim = 1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9f0171436d935..c60240ef46b04 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ 
-63,7 +63,7 @@ ) from pandas.core.internals.base import ( DataManager, - SingleManager, + SingleDataManager, ) from pandas.core.internals.blocks import ( Block, @@ -1528,7 +1528,7 @@ def unstack(self, unstacker, fill_value) -> BlockManager: return bm -class SingleBlockManager(BlockManager, SingleManager): +class SingleBlockManager(BlockManager, SingleDataManager): """ manage a single block with """ ndim = 1 diff --git a/pandas/core/series.py b/pandas/core/series.py index e5c09cc4b72e1..7ce295eeb9efb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -43,6 +43,7 @@ FrameOrSeriesUnion, IndexKeyFunc, NpDtype, + SingleManager, StorageOptions, ValueKeyFunc, ) @@ -131,7 +132,7 @@ from pandas.core.internals import ( SingleArrayManager, SingleBlockManager, - SingleManager, + SingleDataManager, ) from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( @@ -292,7 +293,7 @@ def __init__( ): if ( - isinstance(data, SingleManager) + isinstance(data, SingleDataManager) and index is None and dtype is None and copy is False @@ -306,7 +307,7 @@ def __init__( if fastpath: # data is an ndarray, index is defined - if not isinstance(data, SingleManager): + if not isinstance(data, SingleDataManager): manager = get_option("mode.data_manager") if manager == "block": data = SingleBlockManager.from_array(data, index) @@ -374,7 +375,7 @@ def __init__( data, index = self._init_dict(data, index, dtype) dtype = None copy = False - elif isinstance(data, SingleManager): + elif isinstance(data, SingleDataManager): if index is None: index = data.index elif not data.index.equals(index) or copy: @@ -399,7 +400,7 @@ def __init__( com.require_length_match(data, index) # create/copy the manager - if isinstance(data, SingleManager): + if isinstance(data, SingleDataManager): if dtype is not None: data = data.astype(dtype=dtype, errors="ignore", copy=copy) elif copy: From 606f4675d4c37fe6422486edfcc7697e98fb20e4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche 
Date: Wed, 3 Mar 2021 14:44:13 +0100 Subject: [PATCH 11/15] move logic from Series.array into manager --- pandas/core/internals/array_manager.py | 15 +++++++++++---- pandas/core/internals/managers.py | 4 ++++ pandas/core/series.py | 13 ++----------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 443677c1b61be..dc57863973029 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -46,7 +46,6 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -59,6 +58,7 @@ DatetimeArray, ExtensionArray, IntervalArray, + PandasArray, PeriodArray, TimedeltaArray, ) @@ -75,7 +75,7 @@ ) from pandas.core.internals.base import ( DataManager, - SingleManager, + SingleDataManager, ) from pandas.core.internals.blocks import make_block @@ -1048,7 +1048,7 @@ def _interleaved_dtype(blocks) -> Optional[DtypeObj]: return find_common_type([b.dtype for b in blocks]) -class SingleArrayManager(ArrayManager, SingleManager): +class SingleArrayManager(ArrayManager, SingleDataManager): __slots__ = [ "_axes", # private attribute, because 'axes' has different order, see below @@ -1075,7 +1075,7 @@ def __init__( self._axes = [ensure_index(ax) for ax in self._axes] arr = arrays[0] arr = ensure_wrapped_if_datetimelike(arr) - if isinstance(arr, ABCPandasArray): + if isinstance(arr, PandasArray): arr = arr.to_numpy() self.arrays = [arr] self._verify_integrity() @@ -1129,6 +1129,13 @@ def internal_values(self): """The array that Series._values returns""" return self.array + def array_values(self): + """The array that Series.array returns""" + arr = self.array + if isinstance(arr, np.ndarray): + arr = PandasArray(arr) + return arr + @property def _can_hold_na(self) -> bool: if isinstance(self.array, np.ndarray): diff --git a/pandas/core/internals/managers.py 
b/pandas/core/internals/managers.py index c60240ef46b04..b91ac420ae7b7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1620,6 +1620,10 @@ def internal_values(self): """The array that Series._values returns""" return self._block.internal_values() + def array_values(self): + """The array that Series.array returns""" + return self._block.array_values() + @property def _can_hold_na(self) -> bool: return self._block._can_hold_na diff --git a/pandas/core/series.py b/pandas/core/series.py index 7ce295eeb9efb..cd93b3021aa95 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -98,10 +98,7 @@ ) from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply -from pandas.core.arrays import ( - ExtensionArray, - PandasArray, -) +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com @@ -673,13 +670,7 @@ def _values(self): @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def array(self) -> ExtensionArray: - if isinstance(self._mgr, SingleBlockManager): - return self._mgr._block.array_values() - else: - arr = self._mgr.array - if isinstance(arr, np.ndarray): - arr = PandasArray(arr) - return arr + self._mgr.array_values() # ops def ravel(self, order="C"): From 26186a39660dad1048cde4b0457dc61652a431e5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 16:48:24 +0100 Subject: [PATCH 12/15] fixup return --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cd93b3021aa95..ad3559601c294 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -670,7 +670,7 @@ def _values(self): @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def array(self) -> ExtensionArray: - self._mgr.array_values() + 
return self._mgr.array_values() # ops def ravel(self, order="C"): From 0f4bf415d7aa88a600584ce7effc3ef1549ccf5a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 17:25:21 +0100 Subject: [PATCH 13/15] correct check for PandasArray --- pandas/core/internals/array_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index dc57863973029..56bae63f511be 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -46,6 +46,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, + ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -1075,7 +1076,7 @@ def __init__( self._axes = [ensure_index(ax) for ax in self._axes] arr = arrays[0] arr = ensure_wrapped_if_datetimelike(arr) - if isinstance(arr, PandasArray): + if isinstance(arr, ABCPandasArray): arr = arr.to_numpy() self.arrays = [arr] self._verify_integrity() From 2b2fa84029607659cc8c238a618b77ced4054cae Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Mar 2021 19:41:00 +0100 Subject: [PATCH 14/15] fix typing --- pandas/core/internals/array_manager.py | 4 ++-- pandas/core/series.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 56bae63f511be..d6f2530ed2ca5 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1063,7 +1063,7 @@ class SingleArrayManager(ArrayManager, SingleDataManager): def __init__( self, - arrays: Union[np.ndarray, ExtensionArray], + arrays: List[Union[np.ndarray, ExtensionArray]], axes: List[Index], verify_integrity: bool = True, ): @@ -1090,7 +1090,7 @@ def _verify_integrity(self) -> None: def _normalize_axis(axis): return axis - def make_empty(self: T, axes=None) -> T: + def make_empty(self, axes=None) -> SingleArrayManager: 
"""Return an empty ArrayManager with index/array of length 0""" if axes is None: axes = [Index([], dtype=object)] diff --git a/pandas/core/series.py b/pandas/core/series.py index ad3559601c294..5a5d1c44b312c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -129,7 +129,6 @@ from pandas.core.internals import ( SingleArrayManager, SingleBlockManager, - SingleDataManager, ) from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( @@ -290,7 +289,7 @@ def __init__( ): if ( - isinstance(data, SingleDataManager) + isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None and dtype is None and copy is False @@ -304,7 +303,7 @@ def __init__( if fastpath: # data is an ndarray, index is defined - if not isinstance(data, SingleDataManager): + if not isinstance(data, (SingleBlockManager, SingleArrayManager)): manager = get_option("mode.data_manager") if manager == "block": data = SingleBlockManager.from_array(data, index) @@ -372,7 +371,7 @@ def __init__( data, index = self._init_dict(data, index, dtype) dtype = None copy = False - elif isinstance(data, SingleDataManager): + elif isinstance(data, (SingleBlockManager, SingleArrayManager)): if index is None: index = data.index elif not data.index.equals(index) or copy: @@ -397,7 +396,7 @@ def __init__( com.require_length_match(data, index) # create/copy the manager - if isinstance(data, SingleDataManager): + if isinstance(data, (SingleBlockManager, SingleArrayManager)): if dtype is not None: data = data.astype(dtype=dtype, errors="ignore", copy=copy) elif copy: From 26648c76ba17fb7fdae6aa7581e7d6041863e24a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 4 Mar 2021 18:00:53 +0100 Subject: [PATCH 15/15] simplify py fallback function --- pandas/core/groupby/generic.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 
4075dd3ba4e94..2de5e81360a93 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -108,10 +108,7 @@ all_indexes_same, ) import pandas.core.indexes.base as ibase -from pandas.core.internals import ( - ArrayManager, - BlockManager, -) +from pandas.core.internals import ArrayManager from pandas.core.series import Series from pandas.core.util.numba_ import maybe_use_numba @@ -1151,20 +1148,18 @@ def py_fallback(values: ArrayLike) -> ArrayLike: # in the operation. We un-split here. result = result._consolidate() assert isinstance(result, (Series, DataFrame)) # for mypy + # unwrap DataFrame/Series to get array mgr = result._mgr - if isinstance(mgr, BlockManager): - # unwrap DataFrame to get array - if len(mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. See eg GH-39329 - return mgr.as_array() - else: - result = mgr.blocks[0].values - return result + arrays = mgr.arrays + if len(arrays) != 1: + # We've split an object block! Everything we've assumed + # about a single block input returning a single block output + # is a lie. See eg GH-39329 + return mgr.as_array() else: - result = mgr.arrays[0] - return result + # We are a single block from a BlockManager + # or one array from SingleArrayManager + return arrays[0] def array_func(values: ArrayLike) -> ArrayLike: