From db60b58ec62b1c30c0c7b9fc6d409358c7abf32e Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 20 Apr 2023 14:43:15 -0600 Subject: [PATCH 1/8] CLN: unify NumpyBlock, ObjectBlock, and NumericBlock --- pandas/core/internals/__init__.py | 6 +- pandas/core/internals/blocks.py | 111 +++++++++++---------- pandas/tests/extension/base/casting.py | 9 +- pandas/tests/frame/test_block_internals.py | 13 ++- pandas/tests/internals/test_api.py | 3 +- pandas/tests/series/test_constructors.py | 4 +- 6 files changed, 73 insertions(+), 73 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 0797e62de7a9f..fdca7c0895bc0 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -11,8 +11,7 @@ Block, DatetimeTZBlock, ExtensionBlock, - NumericBlock, - ObjectBlock, + NumpyBlock, ) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( @@ -23,10 +22,9 @@ __all__ = [ "Block", - "NumericBlock", "DatetimeTZBlock", "ExtensionBlock", - "ObjectBlock", + "NumpyBlock", "make_block", "DataManager", "ArrayManager", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 02f8393eed102..a0881ee1a8c88 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -463,9 +463,8 @@ def convert( using_cow: bool = False, ) -> list[Block]: """ - attempt to coerce any object types to better types return a copy - of the block (if copy = True) by definition we are not an ObjectBlock - here! + Attempt to coerce any object types to better types. Return a copy + of the block (if copy = True). """ if not copy and using_cow: return [self.copy(deep=False)] @@ -674,7 +673,7 @@ def _replace_regex( List[Block] """ if not self._can_hold_element(to_replace): - # i.e. only ObjectBlock, but could in principle include a + # i.e. only if self.is_object is True, but could in principle include a # String ExtensionBlock if using_cow: return [self.copy(deep=False)] @@ -1267,7 +1266,7 @@ def fillna( ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to - ObjectBlock and try again + block to hold objects instead and try again """ # Caller is responsible for validating limit; if int it is strictly positive inplace = validate_bool_kwarg(inplace, "inplace") @@ -2058,7 +2057,7 @@ def _unstack( needs_masking: npt.NDArray[np.bool_], ): # ExtensionArray-safe unstack. - # We override ObjectBlock._unstack, which unstacks directly on the + # We override Block._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require # converting to a 2-D ndarray of objects. # Instead, we unstack an ndarray of integer positions, followed by @@ -2094,6 +2093,7 @@ def _unstack( class NumpyBlock(libinternals.NumpyBlock, Block): values: np.ndarray + __slots__ = () @property def is_view(self) -> bool: @@ -2112,10 +2112,56 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: def values_for_json(self) -> np.ndarray: return self.values + @cache_readonly + def is_numeric(self) -> bool: + dtype = self.values.dtype + kind = dtype.kind -class NumericBlock(NumpyBlock): - __slots__ = () - is_numeric = True + if kind in "fciub": + return True + else: + return False + + @cache_readonly + def is_object(self) -> bool: + if self.values.dtype.kind == "O": + return True + else: + return False + + @maybe_split + def convert( + self, + *, + copy: bool = True, + using_cow: bool = False, + ) -> list[Block]: + """ + Attempt to coerce any object types to better types. Return a copy + of the block (if copy = True). + """ + if not self.is_object: + return super().convert(copy=copy, using_cow=using_cow) + + values = self.values + if values.ndim == 2: + # maybe_split ensures we only get here with values.shape[0] == 1, + # avoid doing .ravel as that might make a copy + values = values[0] + + res_values = lib.maybe_convert_objects( + values, + convert_non_numeric=True, + ) + refs = None + if copy and res_values is values: + res_values = values.copy() + elif res_values is values and using_cow: + refs = self.refs + + res_values = ensure_block_shape(res_values, self.ndim) + res_values = maybe_coerce_values(res_values) + return [self.make_block(res_values, refs=refs)] class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock): @@ -2251,49 +2297,6 @@ class DatetimeTZBlock(DatetimeLikeBlock): values_for_json = NDArrayBackedExtensionBlock.values_for_json -class ObjectBlock(NumpyBlock): - __slots__ = () - is_object = True - - @maybe_split - def convert( - self, - *, - copy: bool = True, - using_cow: bool = False, - ) -> list[Block]: - """ - attempt to cast any object types to better types return a copy of - the block (if copy = True) by definition we ARE an ObjectBlock!!!!! - """ - if self.dtype != _dtype_obj: - # GH#50067 this should be impossible in ObjectBlock, but until - # that is fixed, we short-circuit here. - if using_cow: - return [self.copy(deep=False)] - return [self] - - values = self.values - if values.ndim == 2: - # maybe_split ensures we only get here with values.shape[0] == 1, - # avoid doing .ravel as that might make a copy - values = values[0] - - res_values = lib.maybe_convert_objects( - values, - convert_non_numeric=True, - ) - refs = None - if copy and res_values is values: - res_values = values.copy() - elif res_values is values and using_cow: - refs = self.refs - - res_values = ensure_block_shape(res_values, self.ndim) - res_values = maybe_coerce_values(res_values) - return [self.make_block(res_values, refs=refs)] - - # ----------------------------------------------------------------- # Constructor Helpers @@ -2352,10 +2355,8 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: kind = dtype.kind if kind in "Mm": return DatetimeLikeBlock - elif kind in "fciub": - return NumericBlock - return ObjectBlock + return NumpyBlock def new_block_2d( diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index e24c3b4569e3e..4739d2e1ffed0 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -4,7 +4,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas.core.internals import ObjectBlock +from pandas.core.internals import NumpyBlock from pandas.tests.extension.base.base import BaseExtensionTests @@ -16,7 +16,9 @@ def test_astype_object_series(self, all_data): result = ser.astype(object) assert result.dtype == np.dtype(object) if hasattr(result._mgr, "blocks"): - assert isinstance(result._mgr.blocks[0], ObjectBlock) + blk = result._mgr.blocks[0] + assert isinstance(blk, NumpyBlock) + assert blk.is_object assert isinstance(result._mgr.array, np.ndarray) assert result._mgr.array.dtype == np.dtype(object) @@ -26,7 +28,8 @@ def test_astype_object_frame(self, all_data): result = df.astype(object) if hasattr(result._mgr, "blocks"): blk = result._mgr.blocks[0] - assert isinstance(blk, ObjectBlock), type(blk) + assert isinstance(blk, NumpyBlock), type(blk) + assert blk.is_object assert isinstance(result._mgr.arrays[0], np.ndarray) assert result._mgr.arrays[0].dtype == np.dtype(object) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 0ddcbf87e3b4c..4084edf3c20a2 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -20,10 +20,7 @@ option_context, ) import pandas._testing as tm -from pandas.core.internals import ( - NumericBlock, - ObjectBlock, -) +from pandas.core.internals import NumpyBlock # Segregated collection of methods that require the BlockManager internal data # structure @@ -387,7 +384,7 @@ def test_constructor_no_pandas_array(self): result = DataFrame({"A": arr}) expected = DataFrame({"A": [1, 2, 3]}) tm.assert_frame_equal(result, expected) - assert isinstance(result._mgr.blocks[0], NumericBlock) + assert isinstance(result._mgr.blocks[0], NumpyBlock) def test_add_column_with_pandas_array(self): # GH 26390 @@ -400,8 +397,10 @@ def test_add_column_with_pandas_array(self): "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), } ) - assert type(df["c"]._mgr.blocks[0]) == ObjectBlock - assert type(df2["c"]._mgr.blocks[0]) == ObjectBlock + assert type(df["c"]._mgr.blocks[0]) == NumpyBlock + assert df["c"]._mgr.blocks[0].is_object + assert type(df2["c"]._mgr.blocks[0]) == NumpyBlock + assert df2["c"]._mgr.blocks[0].is_object tm.assert_frame_equal(df, df2) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index c759cc163106d..38afa09102d9a 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -27,10 +27,9 @@ def test_namespace(): ] expected = [ "Block", - "NumericBlock", "DatetimeTZBlock", "ExtensionBlock", - "ObjectBlock", + "NumpyBlock", "make_block", "DataManager", "ArrayManager", diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7238232a46e60..cf0ee045b835d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -46,7 +46,7 @@ IntervalArray, period_array, ) -from pandas.core.internals.blocks import NumericBlock +from pandas.core.internals.blocks import NumpyBlock class TestSeriesConstructors: @@ -2092,7 +2092,7 @@ def test_constructor_no_pandas_array(self, using_array_manager): result = Series(ser.array) tm.assert_series_equal(ser, result) if not using_array_manager: - assert isinstance(result._mgr.blocks[0], NumericBlock) + assert isinstance(result._mgr.blocks[0], NumpyBlock) @td.skip_array_manager_invalid_test def test_from_array(self): From 4583b47b31e3036ec474f464a9236510ca474227 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 21 Apr 2023 11:44:57 -0600 Subject: [PATCH 2/8] CLN: respond to review comments --- pandas/core/internals/blocks.py | 74 +++++++++++++-------------------- 1 file changed, 29 insertions(+), 45 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a0881ee1a8c88..b1fc08eb621e8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -466,9 +466,33 @@ def convert( Attempt to coerce any object types to better types. Return a copy of the block (if copy = True). """ - if not copy and using_cow: - return [self.copy(deep=False)] - return [self.copy()] if copy else [self] + if not self.is_object: + if not copy and using_cow: + return [self.copy(deep=False)] + return [self.copy()] if copy else [self] + + if self.ndim != 1 and self.shape[0] != 1: + return self.split_and_operate(Block.convert, copy=copy, using_cow=using_cow) + + values = self.values + if values.ndim == 2: + # maybe_split ensures we only get here with values.shape[0] == 1, + # avoid doing .ravel as that might make a copy + values = values[0] + + res_values = lib.maybe_convert_objects( + values, + convert_non_numeric=True, + ) + refs = None + if copy and res_values is values: + res_values = values.copy() + elif res_values is values and using_cow: + refs = self.refs + + res_values = ensure_block_shape(res_values, self.ndim) + res_values = maybe_coerce_values(res_values) + return [self.make_block(res_values, refs=refs)] # --------------------------------------------------------------------- # Array-Like Methods @@ -2117,51 +2141,11 @@ def is_numeric(self) -> bool: dtype = self.values.dtype kind = dtype.kind - if kind in "fciub": - return True - else: - return False + return kind in "fciub" @cache_readonly def is_object(self) -> bool: - if self.values.dtype.kind == "O": - return True - else: - return False - - @maybe_split - def convert( - self, - *, - copy: bool = True, - using_cow: bool = False, - ) -> list[Block]: - """ - Attempt to coerce any object types to better types. Return a copy - of the block (if copy = True). - """ - if not self.is_object: - return super().convert(copy=copy, using_cow=using_cow) - - values = self.values - if values.ndim == 2: - # maybe_split ensures we only get here with values.shape[0] == 1, - # avoid doing .ravel as that might make a copy - values = values[0] - - res_values = lib.maybe_convert_objects( - values, - convert_non_numeric=True, - ) - refs = None - if copy and res_values is values: - res_values = values.copy() - elif res_values is values and using_cow: - refs = self.refs - - res_values = ensure_block_shape(res_values, self.ndim) - res_values = maybe_coerce_values(res_values) - return [self.make_block(res_values, refs=refs)] + return self.values.dtype.kind == "O" class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock): From 9ae67efc1be65fa9a5e67a50abc225dffb97644c Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 21 Apr 2023 11:59:30 -0600 Subject: [PATCH 3/8] CLN: deprecate ObjectBlock and NumericBlock --- pandas/core/internals/__init__.py | 25 +++++++++++++++++++++++++ pandas/core/internals/blocks.py | 10 ++++++++++ 2 files changed, 35 insertions(+) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index fdca7c0895bc0..fbfbdf6800298 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -36,3 +36,28 @@ # this is preserved here for downstream compatibility (GH-33892) "create_block_manager_from_blocks", ] + + +def __getattr__(name: str): + import warnings + + from pandas.util._exceptions import find_stack_level + + if name in ["NumericBlock", "ObjectBlock"]: + if name == "NumericBlock": + from pandas.core.internals.blocks import NumericBlock + + block_type = NumericBlock + elif name == "ObjectBlock": + from pandas.core.internals.blocks import ObjectBlock + + block_type = ObjectBlock + warnings.warn( + f"{name} is deprecated and will be removed in a future version. " + "Use NumpyBlock instead.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + return block_type + + raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b1fc08eb621e8..6bd8fd8b35c43 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2148,6 +2148,16 @@ def is_object(self) -> bool: return self.values.dtype.kind == "O" +class NumericBlock(NumpyBlock): + # this Block type is kept for backwards-compatibility + __slots__ = () + + +class ObjectBlock(NumpyBlock): + # this Block type is kept for backwards-compatibility + __slots__ = () + + class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray From e827f7b70b74944b981a2262d744af772df4e958 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 21 Apr 2023 13:36:03 -0600 Subject: [PATCH 4/8] CLN: appease mypy --- pandas/core/internals/__init__.py | 17 ++++++++--------- pandas/core/internals/blocks.py | 6 +++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index fbfbdf6800298..e22f6a3501329 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -44,20 +44,19 @@ def __getattr__(name: str): from pandas.util._exceptions import find_stack_level if name in ["NumericBlock", "ObjectBlock"]: - if name == "NumericBlock": - from pandas.core.internals.blocks import NumericBlock - - block_type = NumericBlock - elif name == "ObjectBlock": - from pandas.core.internals.blocks import ObjectBlock - - block_type = ObjectBlock warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use NumpyBlock instead.", DeprecationWarning, stacklevel=find_stack_level(), ) - return block_type + if name == "NumericBlock": + from pandas.core.internals.blocks import NumericBlock + + return NumericBlock + else: + from pandas.core.internals.blocks import ObjectBlock + + return ObjectBlock raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6bd8fd8b35c43..b02fd1c8bd166 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -481,7 +481,7 @@ def convert( values = values[0] res_values = lib.maybe_convert_objects( - values, + values, # type: ignore[arg-type] convert_non_numeric=True, ) refs = None @@ -2137,14 +2137,14 @@ def values_for_json(self) -> np.ndarray: return self.values @cache_readonly - def is_numeric(self) -> bool: + def is_numeric(self) -> bool: # type: ignore[override] dtype = self.values.dtype kind = dtype.kind return kind in "fciub" @cache_readonly - def is_object(self) -> bool: + def is_object(self) -> bool: # type: ignore[override] return self.values.dtype.kind == "O" From f85e1a17ad9cee47075321eaa72677fda453213a Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 21 Apr 2023 14:23:10 -0600 Subject: [PATCH 5/8] CLN: remove out-of-date reference to maybe_split --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b02fd1c8bd166..4ef044af999be 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -476,7 +476,7 @@ def convert( values = self.values if values.ndim == 2: - # maybe_split ensures we only get here with values.shape[0] == 1, + # the check above ensures we only get here with values.shape[0] == 1, # avoid doing .ravel as that might make a copy values = values[0] From 6d077630a65ca21db8f5da9c6043b30278731c2e Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 1 May 2023 10:23:57 -0600 Subject: [PATCH 6/8] CLN: respond to review comments --- pandas/core/internals/blocks.py | 2 ++ pandas/tests/series/test_constructors.py | 1 + 2 files changed, 3 insertions(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4ef044af999be..4cace9d90b139 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2150,11 +2150,13 @@ def is_object(self) -> bool: # type: ignore[override] class NumericBlock(NumpyBlock): # this Block type is kept for backwards-compatibility + # TODO(3.0): delete and remove deprecation in __init__.py. __slots__ = () class ObjectBlock(NumpyBlock): # this Block type is kept for backwards-compatibility + # TODO(3.0): delete and remove deprecation in __init__.py. __slots__ = () diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index cf0ee045b835d..53f91d5c79f78 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2093,6 +2093,7 @@ def test_constructor_no_pandas_array(self, using_array_manager): tm.assert_series_equal(ser, result) if not using_array_manager: assert isinstance(result._mgr.blocks[0], NumpyBlock) + assert result._mgr.blocks[0].is_numeric @td.skip_array_manager_invalid_test def test_from_array(self): From e52b6e8877c4335af8cdb2c28a790ea75e45880c Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 1 May 2023 11:22:21 -0600 Subject: [PATCH 7/8] CLN: remove NumpyBlock from the semi-public API --- pandas/core/internals/__init__.py | 4 +--- pandas/tests/extension/base/casting.py | 2 +- pandas/tests/frame/test_block_internals.py | 2 +- pandas/tests/internals/test_api.py | 1 - 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index e22f6a3501329..284f8ef135d99 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -11,7 +11,6 @@ Block, DatetimeTZBlock, ExtensionBlock, - NumpyBlock, ) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( @@ -24,7 +23,6 @@ "Block", "DatetimeTZBlock", "ExtensionBlock", - "NumpyBlock", "make_block", "DataManager", "ArrayManager", @@ -46,7 +44,7 @@ def __getattr__(name: str): if name in ["NumericBlock", "ObjectBlock"]: warnings.warn( f"{name} is deprecated and will be removed in a future version. " - "Use NumpyBlock instead.", + "Use public APIs instead.", DeprecationWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 4739d2e1ffed0..5a6b0d38e5055 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -4,7 +4,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas.core.internals import NumpyBlock +from pandas.core.internals.blocks import NumpyBlock from pandas.tests.extension.base.base import BaseExtensionTests diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 4084edf3c20a2..2d87bacd8d1b9 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -20,7 +20,7 @@ option_context, ) import pandas._testing as tm -from pandas.core.internals import NumpyBlock +from pandas.core.internals.blocks import NumpyBlock # Segregated collection of methods that require the BlockManager internal data # structure diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 38afa09102d9a..5cd6c718260ea 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -29,7 +29,6 @@ def test_namespace(): "Block", "DatetimeTZBlock", "ExtensionBlock", - "NumpyBlock", "make_block", "DataManager", "ArrayManager", From e6f9b3f8cb4009c5906356858e2095fa43c71591 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 11 May 2023 11:10:41 -0600 Subject: [PATCH 8/8] CLN: test for is_numeric in block internals tests --- pandas/tests/frame/test_block_internals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 2d87bacd8d1b9..4c96738155583 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -385,6 +385,7 @@ def test_constructor_no_pandas_array(self): expected = DataFrame({"A": [1, 2, 3]}) tm.assert_frame_equal(result, expected) assert isinstance(result._mgr.blocks[0], NumpyBlock) + assert result._mgr.blocks[0].is_numeric def test_add_column_with_pandas_array(self): # GH 26390