From a8459ae4fac4e671cf5753c454545470e2053b4b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Nov 2020 08:08:00 -0800 Subject: [PATCH 01/15] REF: de-duplicate Block.__init__ --- pandas/core/internals/blocks.py | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 74b5a184df95d..b1dd6b2ad6bce 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -144,6 +144,10 @@ def __init__(self, values, placement, ndim: int): f"placement implies {len(self.mgr_locs)}" ) + if self.is_extension and self.ndim == 2 and len(self.mgr_locs) != 1: + # TODO(EA2D): check unnecessary with 2D EAs + raise AssertionError("block.size != values.size") + def _maybe_coerce_values(self, values): """ Ensure we have correctly-typed values. @@ -1667,33 +1671,6 @@ class ExtensionBlock(Block): values: ExtensionArray - def __init__(self, values, placement, ndim: int): - """ - Initialize a non-consolidatable block. - - 'ndim' may be inferred from 'placement'. - - This will call continue to call __init__ for the other base - classes mixed in with this Mixin. - """ - - # Placement must be converted to BlockPlacement so that we can check - # its length - if not isinstance(placement, libinternals.BlockPlacement): - placement = libinternals.BlockPlacement(placement) - - # Maybe infer ndim from placement - if ndim is None: - if len(placement) != 1: - ndim = 1 - else: - ndim = 2 - super().__init__(values, placement, ndim=ndim) - - if self.ndim == 2 and len(self.mgr_locs) != 1: - # TODO(EA2D): check unnecessary with 2D EAs - raise AssertionError("block.size != values.size") - @property def shape(self): # TODO(EA2D): override unnecessary with 2D EAs From e105769478a4ff915e66974b889847c0b19c9d35 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Nov 2020 12:26:11 -0800 Subject: [PATCH 02/15] deprecate allowing ndim=None --- pandas/core/internals/blocks.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b1dd6b2ad6bce..abf3218165d1a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -134,9 +134,9 @@ def __init__(self, values, placement, ndim: int): 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame """ # TODO(EA2D): ndim will be unnecessary with 2D EAs - self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = self._maybe_coerce_values(values) + self.ndim = self._check_ndim(values, ndim) if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise ValueError( @@ -184,7 +184,19 @@ def _check_ndim(self, values, ndim): ValueError : the number of dimensions do not match """ if ndim is None: - ndim = values.ndim + warnings.warn( + "Accepting ndim=None in the Block constructor is deprecated, " + "this will raise in a future version.", + FutureWarning, + stacklevel=3, + ) + if self.is_extension: + if len(self.mgr_locs) != 1: + ndim = 1 + else: + ndim = 2 + else: + ndim = values.ndim if self._validate_ndim and values.ndim != ndim: raise ValueError( From 766e4814859a82671862c873d98379a3e50d4458 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Nov 2020 13:33:57 -0800 Subject: [PATCH 03/15] catch warnings in parquet docs --- doc/source/user_guide/io.rst | 4 ++++ doc/source/user_guide/scale.rst | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/doc/source/user_guide/io.rst 
b/doc/source/user_guide/io.rst index 1bd35131622ab..e7e653aed18a7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4735,6 +4735,7 @@ Write to a feather file. Read from a feather file. .. ipython:: python + :okwarning: result = pd.read_feather("example.feather") result @@ -4818,6 +4819,7 @@ Write to a parquet file. Read from a parquet file. .. ipython:: python + :okwarning: result = pd.read_parquet("example_fp.parquet", engine="fastparquet") result = pd.read_parquet("example_pa.parquet", engine="pyarrow") @@ -4827,6 +4829,7 @@ Read from a parquet file. Read only certain columns of a parquet file. .. ipython:: python + :okwarning: result = pd.read_parquet( "example_fp.parquet", @@ -4895,6 +4898,7 @@ Partitioning Parquet files Parquet supports partitioning of data based on the values of one or more columns. .. ipython:: python + :okwarning: df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}) df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 7f2419bc7f19d..ef50ed406bbf1 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -71,6 +71,7 @@ To load the columns we want, we have two options. Option 1 loads in all the data and then filters to what we need. .. ipython:: python + :okwarning: columns = ["id_0", "name_0", "x_0", "y_0"] @@ -98,6 +99,7 @@ referred to as "low-cardinality" data). By using more efficient data types, you can store larger datasets in memory. .. ipython:: python + :okwarning: ts = pd.read_parquet("timeseries.parquet") ts @@ -206,6 +208,7 @@ counts up to this point. As long as each individual file fits in memory, this wi work for arbitrary-sized datasets. .. ipython:: python + :okwarning: %%time files = pathlib.Path("data/timeseries/").glob("ts*.parquet") @@ -289,6 +292,7 @@ returns a Dask Series with the same dtype and the same name. To get the actual result you can call ``.compute()``. .. ipython:: python + :okwarning: %time ddf["name"].value_counts().compute() @@ -322,6 +326,7 @@ Dask implements the most used parts of the pandas API. For example, we can do a familiar groupby aggregation. .. ipython:: python + :okwarning: %time ddf.groupby("name")[["x", "y"]].mean().compute().head() @@ -345,6 +350,7 @@ we need to supply the divisions manually. Now we can do things like fast random access with ``.loc``. .. ipython:: python + :okwarning: ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() From db17e6da11e188f93daafd8a490d7ffe8addfb53 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Nov 2020 15:08:21 -0800 Subject: [PATCH 04/15] okwarning --- doc/source/user_guide/scale.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index ef50ed406bbf1..33be15b951aaa 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -80,6 +80,7 @@ Option 1 loads in all the data and then filters to what we need. Option 2 only loads the columns we request. .. 
ipython:: python + :okwarning: pd.read_parquet("timeseries_wide.parquet", columns=columns) From 5b70d6e199178e97cb9974c1770ad148a7b1eeab Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 29 Nov 2020 12:15:01 -0800 Subject: [PATCH 05/15] FutureWarning -> DeprecationWarning --- pandas/core/internals/blocks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index abf3218165d1a..93c6b7f41a555 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -184,10 +184,11 @@ def _check_ndim(self, values, ndim): ValueError : the number of dimensions do not match """ if ndim is None: + # GH#38134 warnings.warn( "Accepting ndim=None in the Block constructor is deprecated, " "this will raise in a future version.", - FutureWarning, + DeprecationWarning, stacklevel=3, ) if self.is_extension: From 724fc562fca653c5b61acf27283d4b494ff64df6 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 12:25:53 -0800 Subject: [PATCH 06/15] catch warnings from pyarrow --- pandas/tests/io/test_feather.py | 4 +++- pandas/tests/io/test_parquet.py | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 58ae5196151c1..4d7a12de2cc9c 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -88,7 +88,9 @@ def test_basic(self): # df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" - self.check_round_trip(df) + with tm.assert_produces_warning(FutureWarning): + # GH#38134 until pyarrow updates to pass ndim to Block constructor + self.check_round_trip(df) def test_duplicate_columns(self): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fe3ca0d0937b3..7f8cb2d8ca331 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -561,7 +561,9 @@ def test_basic(self, pa, df_full): df["datetime_tz"] = dti df["bool_with_none"] = [True, None, True] - check_round_trip(df, pa) + with tm.assert_produces_warning(FutureWarning): + # GH#38134 until pyarrow updates to pass ndim to Block constructor + check_round_trip(df, pa) def test_basic_subset_columns(self, pa, df_full): # GH18628 @@ -883,7 +885,9 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + with tm.assert_produces_warning(FutureWarning): + # GH#38134 until pyarrow updates to pass ndim to Block constructor + check_round_trip(df, pa, check_dtype=False) @td.skip_if_no("pyarrow", min_version="0.17") def test_filter_row_groups(self, pa): From c6ba82614a7bad6695b0bfc06592de4c731faa91 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 15:06:16 -0800 Subject: [PATCH 07/15] post-merge fixup --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b6225308ab9ad..7766ce109ed15 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1559,7 +1559,7 @@ def _unstack(self, unstacker, fill_value, new_placement): new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [make_block(new_values, placement=new_placement)] + blocks = [make_block(new_values, placement=new_placement, ndim=2)] return blocks, mask def 
quantile(self, qs, interpolation="linear", axis: int = 0): From 8dc0f94f5ee0338d2c428660eaeb273977fef005 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Dec 2020 15:08:19 -0800 Subject: [PATCH 08/15] update stacklevel --- pandas/core/internals/blocks.py | 3 ++- pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_parquet.py | 6 ++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8dd4d8dcaa185..a694c45fe726c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -184,7 +184,7 @@ def _check_ndim(self, values, ndim): "Accepting ndim=None in the Block constructor is deprecated, " "this will raise in a future version.", DeprecationWarning, - stacklevel=3, + stacklevel=10, ) if self.is_extension: if len(self.mgr_locs) != 1: @@ -199,6 +199,7 @@ def _check_ndim(self, values, ndim): "Wrong number of dimensions. " f"values.ndim != ndim [{values.ndim} != {ndim}]" ) + return ndim @property diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 4d7a12de2cc9c..29dd72a1de379 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -88,7 +88,7 @@ def test_basic(self): # df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(DeprecationWarning): # GH#38134 until pyarrow updates to pass ndim to Block constructor self.check_round_trip(df) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 85e91ffad2d53..7ba959b64ebf6 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -561,9 +561,7 @@ def test_basic(self, pa, df_full): df["datetime_tz"] = dti df["bool_with_none"] = [True, None, True] - with tm.assert_produces_warning(FutureWarning): - # GH#38134 until pyarrow updates to pass ndim to Block constructor - check_round_trip(df, pa) + check_round_trip(df, pa) def test_basic_subset_columns(self, pa, df_full): # GH18628 @@ -880,7 +878,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(DeprecationWarning): # GH#38134 until pyarrow updates to pass ndim to Block constructor check_round_trip(df, pa, check_dtype=False) From 60a8649cd9e3c442b7922476f9de5990f9da617c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 16:08:36 -0800 Subject: [PATCH 09/15] CLN: BlockManager.get_slice require only slice arg --- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/managers.py | 32 +++++++++++++++++------- pandas/core/series.py | 3 ++- pandas/tests/internals/test_internals.py | 11 +++++++- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 597023cb5b000..1d65c45910875 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -362,7 +362,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: """ Perform __getitem__-like, return result as block. - As of now, only supports slices that preserve dimensionality. + Only supports slices that preserve dimensionality. 
""" if new_mgr_locs is None: axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 744d3453c8a96..f271866dcf0e3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -788,6 +788,7 @@ def _combine( return type(self).from_blocks(new_blocks, axes) def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + assert isinstance(slobj, slice), type(slobj) if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) @@ -1188,7 +1189,9 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False - def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): + def insert( + self, loc: int, item: Hashable, value: ArrayLike, allow_duplicates: bool = False + ): """ Insert item at selected position. @@ -1196,7 +1199,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False ---------- loc : int item : hashable - value : array_like + value : np.ndarray or ExtensionArray allow_duplicates: bool If False, trying to insert non-unique item will raise @@ -1213,11 +1216,9 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False if value.ndim == 2: value = value.T - elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): - # TODO(EA2D): special case not needed with 2D EAs - value = ensure_block_shape(value, ndim=2) + else: + value = ensure_block_shape(value, ndim=self.ndim) - # TODO: type value as ArrayLike block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self.blknos[loc:]): @@ -1322,7 +1323,7 @@ def reindex_indexer( def _slice_take_blocks_ax0( self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False - ): + ) -> List[Block]: """ Slice/take blocks along axis=0. @@ -1354,6 +1355,7 @@ def _slice_take_blocks_ax0( # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: return [] + # TODO: tests all have isinstance(slobj, slice), other possibilities? 
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: @@ -1363,9 +1365,11 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ - blk.getitem_block([ml], new_mgr_locs=i) + blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i) for i, ml in enumerate(slobj) ] + # We have + # all(np.shares_memory(nb.values, blk.values) for nb in blocks) return blocks else: return [ @@ -1427,7 +1431,8 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies for i, ml in zip(taker, mgr_locs): - nb = blk.getitem_block([i], new_mgr_locs=ml) + nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml) + # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) @@ -1591,7 +1596,15 @@ def _blklocs(self): """ compat with BlockManager """ return None + def getitem_mgr(self, indexer) -> SingleBlockManager: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + array = blk._slice(indexer) + block = blk.make_block_same_class(array, placement=slice(0, len(array))) + return type(self)(block, self.index[indexer]) + def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: + assert isinstance(slobj, slice), type(slobj) if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1962,6 +1975,7 @@ def _preprocess_slice_or_indexer(slice_or_indexer, length: int, allow_fill: bool ): return "mask", slice_or_indexer, slice_or_indexer.sum() else: + # TODO: np.intp? indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) if not allow_fill: indexer = maybe_convert_indices(indexer, length) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5a5d1c44b312c..f1cc38d8c96a7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -983,7 +983,8 @@ def _get_values_tuple(self, key): def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) + new_mgr = self._mgr.getitem_mgr(indexer) + return self._constructor(new_mgr).__finalize__(self) except ValueError: # mpl compat if we look up e.g. 
ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 683006d9b3b9c..a7f318498a8ac 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -823,7 +823,16 @@ def assert_slice_ok(mgr, axis, slobj): slobj = np.concatenate( [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] ) - sliced = mgr.get_slice(slobj, axis=axis) + + if isinstance(slobj, slice): + sliced = mgr.get_slice(slobj, axis=axis) + elif mgr.ndim == 1 and axis == 0: + sliced = mgr.getitem_mgr(slobj) + else: + # BlockManager doesnt support non-slice, SingleBlockManager + # doesnt support axis > 0 + return + mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( mat[mat_slobj], sliced.as_array(), check_dtype=False From b17ad656657859b89cc3e051dd7636976b6e53c0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 17:48:38 -0800 Subject: [PATCH 10/15] mypy fixup --- pandas/core/internals/array_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 905fa448ff033..67f4cb4511389 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -779,6 +779,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: return type(self)(arrays, new_axes, verify_integrity=False) + getitem_mgr = get_slice + def fast_xs(self, loc: int) -> ArrayLike: """ Return the array corresponding to `frame.iloc[loc]`. From 5afce0407028adc012c2f2d168f26748c587e783 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 6 Mar 2021 11:57:50 -0800 Subject: [PATCH 11/15] PERF: implement Index._getitem_slice --- pandas/core/indexes/base.py | 7 +++++++ pandas/core/indexes/multi.py | 18 ++++++++++++++++++ pandas/core/indexes/range.py | 7 +++++++ pandas/core/internals/managers.py | 5 +++-- 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 09c143468bc31..a81a085dc6ae6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4560,6 +4560,13 @@ def __getitem__(self, key): else: return result + def _getitem_slice(self: _IndexT, slobj: slice) -> _IndexT: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._data[slobj] + return type(self)._simple_new(res, name=self._name) + @final def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd0e0ef5fa799..990cc0907f46a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2097,6 +2097,24 @@ def __getitem__(self, key): verify_integrity=False, ) + def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: + """ + Fastpath for __getitem__ when we know we have a slice. 
+ """ + sortorder = None + if slobj.step is None or slobj.step > 0: + sortorder = self.sortorder + + new_codes = [level_codes[slobj] for level_codes in self.codes] + + return type(self)( + levels=self.levels, + codes=new_codes, + names=self._names, + sortorder=sortorder, + verify_integrity=False, + ) + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take( self: MultiIndex, indices, axis=0, allow_fill=True, fill_value=None, **kwargs diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index bf5a9825f04d0..c12fbb21c313b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -816,6 +816,13 @@ def __getitem__(self, key): # fall back to Int64Index return super().__getitem__(key) + def _getitem_slice(self: RangeIndex, slobj: slice) -> RangeIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._range[slobj] + return type(self)._simple_new(res, name=self._name) + @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cc8730c31f65e..2daa1ce8dc9a4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -813,7 +813,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] + new_axes[axis] = new_axes[axis]._getitem_slice(slobj) return type(self)._simple_new(tuple(new_blocks), new_axes) @@ -1624,7 +1624,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) block = blk.make_block_same_class(array, placement=slice(0, len(array))) - return type(self)(block, self.index[slobj]) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) @property def index(self) -> Index: From 97a1e22b4ba6a8b91a37242f6c640094b90390b8 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Mar 2021 14:58:53 -0800 Subject: [PATCH 12/15] REF: require BlockPlacement in the Block constructor --- pandas/core/internals/blocks.py | 35 +++++++++++----- pandas/core/internals/managers.py | 42 +++++++++++++------ pandas/core/internals/ops.py | 2 +- pandas/tests/extension/test_external_block.py | 5 ++- pandas/tests/internals/test_internals.py | 8 ++-- 5 files changed, 62 insertions(+), 30 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e157d741fbe77..86a2fa64c505c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -155,7 +155,7 @@ def _simple_new( obj._mgr_locs = placement return obj - def __init__(self, values, placement, ndim: int): + def __init__(self, values, placement: BlockPlacement, ndim: int): """ Parameters ---------- @@ -164,8 +164,10 @@ def __init__(self, values, placement, ndim: int): ndim : int 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame """ + assert isinstance(ndim, int) + assert isinstance(placement, BlockPlacement) self.ndim = ndim - self.mgr_locs = placement + self._mgr_locs = placement self.values = self._maybe_coerce_values(values) @classmethod @@ -259,14 +261,12 @@ def fill_value(self): return np.nan @property - def mgr_locs(self): + def mgr_locs(self) -> BlockPlacement: return self._mgr_locs @mgr_locs.setter - def mgr_locs(self, new_mgr_locs): - if not isinstance(new_mgr_locs, libinternals.BlockPlacement): - new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs) 
- + def mgr_locs(self, new_mgr_locs: BlockPlacement): + assert isinstance(new_mgr_locs, BlockPlacement) self._mgr_locs = new_mgr_locs @final @@ -283,7 +283,9 @@ def make_block(self, values, placement=None) -> Block: return new_block(values, placement=placement, ndim=self.ndim) @final - def make_block_same_class(self, values, placement=None) -> Block: + def make_block_same_class( + self, values, placement: Optional[BlockPlacement] = None + ) -> Block: """ Wrap given values in a block of same type as self. """ if placement is None: placement = self.mgr_locs @@ -1266,7 +1268,11 @@ def func(yvalues: np.ndarray) -> np.ndarray: return self._maybe_downcast(blocks, downcast) def take_nd( - self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_default + self, + indexer, + axis: int, + new_mgr_locs: Optional[BlockPlacement] = None, + fill_value=lib.no_default, ) -> Block: """ Take values according to indexer and return them as a block.bb @@ -1645,7 +1651,11 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): return self.make_block(values) def take_nd( - self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default + self, + indexer, + axis: int = 0, + new_mgr_locs: Optional[BlockPlacement] = None, + fill_value=lib.no_default, ) -> Block: """ Take values according to indexer and return them as a block. @@ -1812,7 +1822,7 @@ def _unstack(self, unstacker, fill_value, new_placement): blocks = [ self.make_block_same_class( self.values.take(indices, allow_fill=True, fill_value=fill_value), - [place], + BlockPlacement(place), ) for indices, place in zip(new_values.T, new_placement) ] @@ -2300,6 +2310,9 @@ def new_block(values, placement, *, ndim: int, klass=None) -> Block: if klass is None: klass = get_block_type(values, values.dtype) + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + return klass(values, ndim=ndim, placement=placement) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 447148b4ef0b7..d766830f75c27 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -23,6 +23,7 @@ internals as libinternals, lib, ) +from pandas._libs.internals import BlockPlacement from pandas._typing import ( ArrayLike, Dtype, @@ -239,7 +240,8 @@ def make_empty(self: T, axes=None) -> T: assert isinstance(self, SingleBlockManager) # for mypy blk = self.blocks[0] arr = blk.values[:0] - nb = blk.make_block_same_class(arr, placement=slice(0, 0)) + bp = BlockPlacement(slice(0, 0)) + nb = blk.make_block_same_class(arr, placement=bp) blocks = [nb] else: blocks = [] @@ -799,7 +801,7 @@ def _combine( new_blocks: List[Block] = [] for b in blocks: b = b.copy(deep=copy) - b.mgr_locs = inv_indexer[b.mgr_locs.indexer] + b.mgr_locs = BlockPlacement(inv_indexer[b.mgr_locs.indexer]) new_blocks.append(b) axes = list(self.axes) @@ -1069,7 +1071,8 @@ def iget(self, i: int) -> SingleBlockManager: values = block.iget(self.blklocs[i]) # shortcut for select a single-dim from a 2-dim BM - nb = type(block)(values, placement=slice(0, len(values)), ndim=1) + bp = BlockPlacement(slice(0, len(values))) + nb = type(block)(values, placement=bp, ndim=1) return SingleBlockManager(nb, self.axes[1]) def iget_values(self, i: int) -> ArrayLike: @@ -1281,7 +1284,7 @@ def insert( else: new_mgr_locs = blk.mgr_locs.as_array.copy() new_mgr_locs[new_mgr_locs >= loc] += 1 - blk.mgr_locs = new_mgr_locs + blk.mgr_locs = BlockPlacement(new_mgr_locs) # Accessing public blklocs ensures the public versions are initialized 
if loc == self.blklocs.shape[0]: @@ -1409,7 +1412,8 @@ def _slice_take_blocks_ax0( if sllen == 0: return [] # TODO: tests all have isinstance(slobj, slice), other possibilities? - return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] + bp = BlockPlacement(slice(0, sllen)) + return [blk.getitem_block(slobj, new_mgr_locs=bp)] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -1418,18 +1422,21 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ - blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i) + blk.getitem_block( + slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i) + ) for i, ml in enumerate(slobj) ] # We have # all(np.shares_memory(nb.values, blk.values) for nb in blocks) return blocks else: + bp = BlockPlacement(slice(0, sllen)) return [ blk.take_nd( slobj, axis=0, - new_mgr_locs=slice(0, sllen), + new_mgr_locs=bp, fill_value=fill_value, ) ] @@ -1466,7 +1473,7 @@ def _slice_take_blocks_ax0( # item. for mgr_loc in mgr_locs: newblk = blk.copy(deep=False) - newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) blocks.append(newblk) else: @@ -1659,8 +1666,15 @@ def getitem_mgr(self, indexer) -> SingleBlockManager: # similar to get_slice, but not restricted to slice indexer blk = self._block array = blk._slice(indexer) - block = blk.make_block_same_class(array, placement=slice(0, len(array))) - return type(self)(block, self.index[indexer]) + if array.ndim > 1: + # This will be caught by Series._get_values + raise ValueError("dimension-expanding indexing not allowed") + + bp = BlockPlacement(slice(0, len(array))) + block = blk.make_block_same_class(array, placement=bp) + + new_idx = self.index[indexer] + return type(self)(block, new_idx) def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: assert isinstance(slobj, slice), type(slobj) @@ -1672,7 +1686,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: if array.ndim > blk.values.ndim: # This will be caught by Series._get_values raise ValueError("dimension-expanding indexing not allowed") - block = blk.make_block_same_class(array, placement=slice(0, len(array))) + bp = BlockPlacement(slice(0, len(array))) + block = blk.make_block_same_class(array, placement=bp) new_index = self.index._getitem_slice(slobj) return type(self)(block, new_index) @@ -1736,7 +1751,7 @@ def set_values(self, values: ArrayLike): valid for the current Block/SingleBlockManager (length, dtype, etc). """ self.blocks[0].values = values - self.blocks[0]._mgr_locs = libinternals.BlockPlacement(slice(len(values))) + self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) # -------------------------------------------------------------------- @@ -2022,7 +2037,8 @@ def _merge_blocks( new_values = new_values[argsort] new_mgr_locs = new_mgr_locs[argsort] - return [new_block(new_values, placement=new_mgr_locs, ndim=2)] + bp = BlockPlacement(new_mgr_locs) + return [new_block(new_values, placement=bp, ndim=2)] # can't consolidate --> no merge return blocks diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 103092ba37b70..996421ade226c 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -87,7 +87,7 @@ def _reset_block_mgr_locs(nbs: List[Block], locs): Reset mgr_locs to correspond to our original DataFrame. 
""" for nb in nbs: - nblocs = locs.as_array[nb.mgr_locs.indexer] + nblocs = locs[nb.mgr_locs.indexer] nb.mgr_locs = nblocs # Assertions are disabled for performance, but should hold: # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 693d0645c9519..9360294f5a3f7 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._libs.internals import BlockPlacement + import pandas as pd from pandas.core.internals import BlockManager from pandas.core.internals.blocks import ExtensionBlock @@ -17,7 +19,8 @@ def df(): df1 = pd.DataFrame({"a": [1, 2, 3]}) blocks = df1._mgr.blocks values = np.arange(3, dtype="int64") - custom_block = CustomBlock(values, placement=slice(1, 2), ndim=2) + bp = BlockPlacement(slice(1, 2)) + custom_block = CustomBlock(values, placement=bp, ndim=2) blocks = blocks + (custom_block,) block_manager = BlockManager(blocks, [pd.Index(["a", "b"]), df1.index]) return pd.DataFrame(block_manager) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2f5764ab5bd77..714010f411a99 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -327,8 +327,8 @@ def test_duplicate_ref_loc_failure(self): axes, blocks = tmp_mgr.axes, tmp_mgr.blocks - blocks[0].mgr_locs = np.array([0]) - blocks[1].mgr_locs = np.array([0]) + blocks[0].mgr_locs = BlockPlacement(np.array([0])) + blocks[1].mgr_locs = BlockPlacement(np.array([0])) # test trying to create block manager with overlapping ref locs @@ -338,8 +338,8 @@ def test_duplicate_ref_loc_failure(self): mgr = BlockManager(blocks, axes) mgr._rebuild_blknos_and_blklocs() - blocks[0].mgr_locs = np.array([0]) - blocks[1].mgr_locs = np.array([1]) + blocks[0].mgr_locs = BlockPlacement(np.array([0])) + blocks[1].mgr_locs = BlockPlacement(np.array([1])) mgr = BlockManager(blocks, axes) mgr.iget(1) From afdc559e0494bb03a8cc6a9a0639d3fd141fa55f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Mar 2021 16:23:11 -0800 Subject: [PATCH 13/15] mypy fixup --- pandas/core/internals/api.py | 19 ++++++++++++++++--- pandas/core/internals/blocks.py | 4 ++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index e6ea2c642650d..3048a50da81bd 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -6,14 +6,23 @@ 2) Use only functions exposed here (or in core.internals) """ -from typing import Optional +from typing import ( + Optional, + cast, +) import numpy as np from pandas._libs.internals import BlockPlacement -from pandas._typing import Dtype +from pandas._typing import ( + Dtype, + DtypeObj, +) -from pandas.core.dtypes.common import is_datetime64tz_dtype +from pandas.core.dtypes.common import ( + is_datetime64tz_dtype, + pandas_dtype, +) from pandas.core.arrays import DatetimeArray from pandas.core.internals.blocks import ( @@ -39,6 +48,10 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ + if dtype is not None: + dtype = pandas_dtype(dtype) + dtype = cast(Optional[DtypeObj], dtype) + values, dtype = extract_pandas_array(values, dtype, ndim) if klass is None: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 86a2fa64c505c..f44a7a27f5891 100644 --- a/pandas/core/internals/blocks.py +++ 
b/pandas/core/internals/blocks.py @@ -2359,8 +2359,8 @@ def check_ndim(values, placement: BlockPlacement, ndim: int): def extract_pandas_array( - values: ArrayLike, dtype: Optional[DtypeObj], ndim: int -) -> Tuple[ArrayLike, Optional[DtypeObj]]: + values: Union[np.ndarray, ExtensionArray], dtype: Optional[DtypeObj], ndim: int +) -> Tuple[Union[np.ndarray, ExtensionArray], Optional[DtypeObj]]: """ Ensure that we don't allow PandasArray / PandasDtype in internals. """ From 935a2920fa029b49a811e3cc312b4204781a7cf2 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Mar 2021 17:31:21 -0800 Subject: [PATCH 14/15] mypy fixup --- pandas/core/internals/api.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 3048a50da81bd..468c338e215fe 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -6,18 +6,12 @@ 2) Use only functions exposed here (or in core.internals) """ -from typing import ( - Optional, - cast, -) +from typing import Optional import numpy as np from pandas._libs.internals import BlockPlacement -from pandas._typing import ( - Dtype, - DtypeObj, -) +from pandas._typing import Dtype from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -50,7 +44,6 @@ def make_block( """ if dtype is not None: dtype = pandas_dtype(dtype) - dtype = cast(Optional[DtypeObj], dtype) values, dtype = extract_pandas_array(values, dtype, ndim) From abdf059711a83e816392e32c103177232a494b58 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 09:41:13 -0700 Subject: [PATCH 15/15] mypy fixup --- pandas/core/arrays/timedeltas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f86d8755acb22..f3889ff360aa8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -42,7 +42,6 @@ from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, - is_categorical_dtype, is_dtype_equal, is_float_dtype, is_integer_dtype, @@ -53,7 +52,10 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ABCMultiIndex +from pandas.core.dtypes.generic import ( + ABCCategorical, + ABCMultiIndex, +) from pandas.core.dtypes.missing import isna from pandas.core import nanops @@ -970,7 +972,7 @@ def sequence_to_td64ns( elif not isinstance(data, (np.ndarray, ExtensionArray)): # GH#24539 e.g. xarray, dask object data = np.asarray(data) - elif is_categorical_dtype(data.dtype): + elif isinstance(data, ABCCategorical): data = data.categories.take(data.codes, fill_value=NaT)._values copy = False
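
Note on downstream usage (illustrative only, not part of the patches above): with the
GH#38134 deprecation introduced in this series, the Block constructor expects an explicit
integer ndim, and block construction is meant to go through the pseudo-public
``pandas.core.internals.make_block`` entry point referenced in ``pandas/core/internals/api.py``.
A minimal sketch of passing ndim explicitly is below; the exact warning behavior of the
ndim=None fallback varies across pandas versions, so treat this as an assumption-laden
example rather than a definitive recipe.

    import numpy as np
    from pandas.core.internals import make_block

    # 2D values with a length-1 placement, matching ndim=2 for a DataFrame block;
    # make_block wraps the slice in a BlockPlacement internally.
    values = np.arange(3, dtype="int64").reshape(1, 3)
    blk = make_block(values, placement=slice(0, 1), ndim=2)

    # Omitting ndim relies on the deprecated inference path and, after this
    # series, is expected to emit a DeprecationWarning from the Block constructor.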