From 57cc6e866e272dfe8433394b51493b1611d5deb5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Mar 2021 17:33:30 -0700 Subject: [PATCH 1/3] REF: move Manager.insert boilerplate to DataFrame.insert --- pandas/core/frame.py | 8 ++++- pandas/core/internals/array_manager.py | 19 ++--------- pandas/core/internals/managers.py | 31 ++---------------- pandas/core/series.py | 11 ++++--- pandas/tests/internals/test_internals.py | 40 +----------------------- 5 files changed, 18 insertions(+), 91 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d0c8e5f29413..32439af6db238 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4298,8 +4298,14 @@ def insert(self, loc, column, value, allow_duplicates: bool = False) -> None: "Cannot specify 'allow_duplicates=True' when " "'self.flags.allows_duplicate_labels' is False." ) + if not allow_duplicates and column in self.columns: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {column}, already exists") + if not isinstance(loc, int): + raise TypeError("loc must be int") + value = self._sanitize_column(value) - self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) + self._mgr.insert(loc, column, value) def assign(self, **kwargs) -> DataFrame: r""" diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 99a1706c671b1..22076612b4bc3 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -904,7 +904,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): self.arrays[mgr_idx] = value_arr return - def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): + def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: """ Insert item at selected position. @@ -912,18 +912,8 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False ---------- loc : int item : hashable - value : array_like - allow_duplicates: bool - If False, trying to insert non-unique item will raise - + value : np.ndarray or ExtensionArray """ - if not allow_duplicates and item in self.items: - # Should this be a different kind of error?? - raise ValueError(f"cannot insert {item}, already exists") - - if not isinstance(loc, int): - raise TypeError("loc must be int") - # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1236,11 +1226,6 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: new_index = self.index._getitem_slice(slobj) return type(self)([new_array], [new_index], verify_integrity=False) - def getitem_mgr(self, indexer) -> SingleArrayManager: - new_array = self.array[indexer] - new_index = self.index[indexer] - return type(self)([new_array], [new_index]) - def apply(self, func, **kwargs): if callable(func): new_array = func(self.array, **kwargs) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e4cce731b7b56..3ea02ee629ffb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1075,8 +1075,7 @@ def idelete(self, indexer): blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) - # FIXME: use Index.delete as soon as it uses fastpath=True - self.axes[0] = self.items[~is_deleted] + self.axes[0] = self.items.delete(indexer) self.blocks = tuple( b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno] ) @@ -1208,9 +1207,7 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False - def insert( - self, loc: int, item: Hashable, value: ArrayLike, allow_duplicates: bool = False - ): + def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: """ Insert item at selected position. @@ -1219,17 +1216,7 @@ def insert( loc : int item : hashable value : np.ndarray or ExtensionArray - allow_duplicates: bool - If False, trying to insert non-unique item will raise - """ - if not allow_duplicates and item in self.items: - # Should this be a different kind of error?? - raise ValueError(f"cannot insert {item}, already exists") - - if not isinstance(loc, int): - raise TypeError("loc must be int") - # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1633,20 +1620,6 @@ def _blklocs(self): """ compat with BlockManager """ return None - def getitem_mgr(self, indexer) -> SingleBlockManager: - # similar to get_slice, but not restricted to slice indexer - blk = self._block - array = blk._slice(indexer) - if array.ndim > 1: - # This will be caught by Series._get_values - raise ValueError("dimension-expanding indexing not allowed") - - bp = BlockPlacement(slice(0, len(array))) - block = blk.make_block_same_class(array, placement=bp) - - new_idx = self.index[indexer] - return type(self)(block, new_idx) - def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: assert isinstance(slobj, slice), type(slobj) if axis >= self.ndim: diff --git a/pandas/core/series.py b/pandas/core/series.py index 27042f7de9dc1..f08dbf582844e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -994,15 +994,16 @@ def _get_values_tuple(self, key): self ) - def _get_values(self, indexer): - try: - new_mgr = self._mgr.getitem_mgr(indexer) - return self._constructor(new_mgr).__finalize__(self) - except ValueError: + def _get_values(self, indexer) -> Series: + new_values = self._values[indexer] + if new_values.ndim > 1: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack # the asarray is needed to avoid returning a 2D DatetimeArray return np.asarray(self._values[indexer]) + new_index = self.index[indexer] + res = self._constructor(new_values, index=new_index, dtype=new_values.dtype) + return res.__finalize__(self) def _get_value(self, label, takeable: bool = False): """ diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fc06b85b1f954..fd83211adb05e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -822,23 +822,7 @@ def test_get_slice(self, mgr): def assert_slice_ok(mgr, axis, slobj): mat = mgr.as_array() - # we maybe using an ndarray to test slicing and - # might not be the full length of the axis - if isinstance(slobj, np.ndarray): - ax = mgr.axes[axis] - if len(ax) and len(slobj) and len(slobj) != len(ax): - slobj = np.concatenate( - [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] - ) - - if isinstance(slobj, slice): - sliced = mgr.get_slice(slobj, axis=axis) - elif mgr.ndim == 1 and axis == 0: - sliced = mgr.getitem_mgr(slobj) - else: - # BlockManager doesn't support non-slice, SingleBlockManager - # doesn't support axis > 0 - return + sliced = mgr.get_slice(slobj, axis=axis) mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( @@ -855,28 +839,6 @@ def assert_slice_ok(mgr, axis, slobj): assert_slice_ok(mgr, ax, slice(1, 4)) assert_slice_ok(mgr, ax, slice(3, 0, -2)) - if mgr.ndim < 2: - # 2D only support slice objects - - # boolean mask - assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) - assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) - assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) - - if mgr.shape[ax] >= 3: - assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) - assert_slice_ok( - mgr, ax, np.array([True, True, False], dtype=np.bool_) - ) - - # fancy indexer - assert_slice_ok(mgr, ax, []) - assert_slice_ok(mgr, ax, list(range(mgr.shape[ax]))) - - if mgr.shape[ax] >= 3: - assert_slice_ok(mgr, ax, [0, 1, 2]) - assert_slice_ok(mgr, ax, [-1, -2, -3]) - @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): def assert_take_ok(mgr, axis, indexer): From 37fd8c39eae12173764acc6d72cccfe803d4a3b3 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 13:52:52 -0700 Subject: [PATCH 2/3] revert getitem_mgr deletion --- pandas/core/internals/array_manager.py | 5 +++ pandas/core/internals/managers.py | 14 +++++++++ pandas/core/series.py | 11 +++---- pandas/tests/internals/test_internals.py | 40 +++++++++++++++++++++++- 4 files changed, 63 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 22076612b4bc3..60df39312e971 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1226,6 +1226,11 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: new_index = self.index._getitem_slice(slobj) return type(self)([new_array], [new_index], verify_integrity=False) + def getitem_mgr(self, indexer) -> SingleArrayManager: + new_array = self.array[indexer] + new_index = self.index[indexer] + return type(self)([new_array], [new_index]) + def apply(self, func, **kwargs): if callable(func): new_array = func(self.array, **kwargs) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3ea02ee629ffb..4985cbbe36bc8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1620,6 +1620,20 @@ def _blklocs(self): """ compat with BlockManager """ return None + def getitem_mgr(self, indexer) -> SingleBlockManager: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + array = blk._slice(indexer) + if array.ndim > 1: + # This will be caught by Series._get_values + raise ValueError("dimension-expanding indexing not allowed") + + bp = BlockPlacement(slice(0, len(array))) + block = blk.make_block_same_class(array, placement=bp) + + new_idx = self.index[indexer] + return type(self)(block, new_idx) + def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: assert isinstance(slobj, slice), type(slobj) if axis >= self.ndim: diff --git a/pandas/core/series.py b/pandas/core/series.py index f08dbf582844e..27042f7de9dc1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -994,16 +994,15 @@ def _get_values_tuple(self, key): self ) - def _get_values(self, indexer) -> Series: - new_values = self._values[indexer] - if new_values.ndim > 1: + def _get_values(self, indexer): + try: + new_mgr = self._mgr.getitem_mgr(indexer) + return self._constructor(new_mgr).__finalize__(self) + except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack # the asarray is needed to avoid returning a 2D DatetimeArray return np.asarray(self._values[indexer]) - new_index = self.index[indexer] - res = self._constructor(new_values, index=new_index, dtype=new_values.dtype) - return res.__finalize__(self) def _get_value(self, label, takeable: bool = False): """ diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fd83211adb05e..fc06b85b1f954 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -822,7 +822,23 @@ def test_get_slice(self, mgr): def assert_slice_ok(mgr, axis, slobj): mat = mgr.as_array() - sliced = mgr.get_slice(slobj, axis=axis) + # we maybe using an ndarray to test slicing and + # might not be the full length of the axis + if isinstance(slobj, np.ndarray): + ax = mgr.axes[axis] + if len(ax) and len(slobj) and len(slobj) != len(ax): + slobj = np.concatenate( + [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] + ) + + if isinstance(slobj, slice): + sliced = mgr.get_slice(slobj, axis=axis) + elif mgr.ndim == 1 and axis == 0: + sliced = mgr.getitem_mgr(slobj) + else: + # BlockManager doesn't support non-slice, SingleBlockManager + # doesn't support axis > 0 + return mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( @@ -839,6 +855,28 @@ def assert_slice_ok(mgr, axis, slobj): assert_slice_ok(mgr, ax, slice(1, 4)) assert_slice_ok(mgr, ax, slice(3, 0, -2)) + if mgr.ndim < 2: + # 2D only support slice objects + + # boolean mask + assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) + + if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) + assert_slice_ok( + mgr, ax, np.array([True, True, False], dtype=np.bool_) + ) + + # fancy indexer + assert_slice_ok(mgr, ax, []) + assert_slice_ok(mgr, ax, list(range(mgr.shape[ax]))) + + if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, [0, 1, 2]) + assert_slice_ok(mgr, ax, [-1, -2, -3]) + @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): def assert_take_ok(mgr, axis, indexer): From 7f7bbce093201b2fba89b6d715437c34e02462fd Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 14:42:29 -0700 Subject: [PATCH 3/3] type ignore, windows troubleshoot --- pandas/core/internals/array_manager.py | 5 ++++- pandas/core/internals/managers.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 60df39312e971..950d229c45f9e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -920,7 +920,10 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: value = extract_array(value, extract_numpy=True) if value.ndim == 2: if value.shape[0] == 1: - value = value[0, :] + # error: Invalid index type "Tuple[int, slice]" for + # "Union[Any, ExtensionArray, ndarray]"; expected type + # "Union[int, slice, ndarray]" + value = value[0, :] # type: ignore[index] else: raise ValueError( f"Expected a 1D array, got an array with shape {value.shape}" diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4985cbbe36bc8..14fa994631623 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1075,7 +1075,8 @@ def idelete(self, indexer): blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) - self.axes[0] = self.items.delete(indexer) + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] self.blocks = tuple( b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno] )