-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[ArrayManager] DataFrame constructors #39991
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
61983d8
1d0315f
ffc8314
46e73c8
3e108df
854bb17
8726d42
6e17183
8096665
aef4cc8
9c0a3d6
1eb5cb7
0992e67
936b290
54d36ab
c56ffa8
164387c
143b572
6166927
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -564,41 +564,59 @@ def __init__( | |
if isinstance(data, DataFrame): | ||
data = data._mgr | ||
|
||
if isinstance(data, (BlockManager, ArrayManager)): | ||
if index is None and columns is None and dtype is None and copy is False: | ||
# GH#33357 fastpath | ||
NDFrame.__init__(self, data) | ||
return | ||
# first check if a Manager is passed without any other arguments | ||
# -> use fastpath (without checking Manager type) | ||
if ( | ||
index is None | ||
and columns is None | ||
and dtype is None | ||
and copy is False | ||
and isinstance(data, (BlockManager, ArrayManager)) | ||
): | ||
# GH#33357 fastpath | ||
NDFrame.__init__(self, data) | ||
return | ||
|
||
manager = get_option("mode.data_manager") | ||
|
||
if isinstance(data, (BlockManager, ArrayManager)): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
mgr = self._init_mgr( | ||
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy | ||
) | ||
|
||
elif isinstance(data, dict): | ||
mgr = init_dict(data, index, columns, dtype=dtype) | ||
mgr = init_dict(data, index, columns, dtype=dtype, typ=manager) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you rename typ -> manager? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note that this is the same keyword as I use for […]. Alternatively, we could make the […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, yeah, +1 on the rename (a follow-on is OK) and on anything that makes the keyword more obvious. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. -> #40074 |
||
elif isinstance(data, ma.MaskedArray): | ||
import numpy.ma.mrecords as mrecords | ||
|
||
# masked recarray | ||
if isinstance(data, mrecords.MaskedRecords): | ||
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) | ||
mgr = masked_rec_array_to_mgr( | ||
data, index, columns, dtype, copy, typ=manager | ||
) | ||
|
||
# a masked array | ||
else: | ||
data = sanitize_masked_array(data) | ||
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) | ||
mgr = init_ndarray( | ||
data, index, columns, dtype=dtype, copy=copy, typ=manager | ||
) | ||
|
||
elif isinstance(data, (np.ndarray, Series, Index)): | ||
if data.dtype.names: | ||
data_columns = list(data.dtype.names) | ||
data = {k: data[k] for k in data_columns} | ||
if columns is None: | ||
columns = data_columns | ||
mgr = init_dict(data, index, columns, dtype=dtype) | ||
mgr = init_dict(data, index, columns, dtype=dtype, typ=manager) | ||
elif getattr(data, "name", None) is not None: | ||
mgr = init_dict({data.name: data}, index, columns, dtype=dtype) | ||
mgr = init_dict( | ||
{data.name: data}, index, columns, dtype=dtype, typ=manager | ||
) | ||
else: | ||
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) | ||
mgr = init_ndarray( | ||
data, index, columns, dtype=dtype, copy=copy, typ=manager | ||
) | ||
|
||
# For data is list-like, or Iterable (will consume into list) | ||
elif is_list_like(data): | ||
|
@@ -611,11 +629,15 @@ def __init__( | |
arrays, columns, index = nested_data_to_arrays( | ||
data, columns, index, dtype | ||
) | ||
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) | ||
mgr = arrays_to_mgr( | ||
arrays, columns, index, columns, dtype=dtype, typ=manager | ||
) | ||
else: | ||
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) | ||
mgr = init_ndarray( | ||
data, index, columns, dtype=dtype, copy=copy, typ=manager | ||
) | ||
else: | ||
mgr = init_dict({}, index, columns, dtype=dtype) | ||
mgr = init_dict({}, index, columns, dtype=dtype, typ=manager) | ||
# For data is scalar | ||
else: | ||
if index is None or columns is None: | ||
|
@@ -632,18 +654,19 @@ def __init__( | |
construct_1d_arraylike_from_scalar(data, len(index), dtype) | ||
for _ in range(len(columns)) | ||
] | ||
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) | ||
mgr = arrays_to_mgr( | ||
values, columns, index, columns, dtype=None, typ=manager | ||
) | ||
else: | ||
values = construct_2d_arraylike_from_scalar( | ||
data, len(index), len(columns), dtype, copy | ||
) | ||
|
||
mgr = init_ndarray( | ||
values, index, columns, dtype=values.dtype, copy=False | ||
values, index, columns, dtype=values.dtype, copy=False, typ=manager | ||
) | ||
|
||
# ensure correct Manager type according to settings | ||
manager = get_option("mode.data_manager") | ||
mgr = mgr_to_mgr(mgr, typ=manager) | ||
|
||
NDFrame.__init__(self, mgr) | ||
|
@@ -1971,7 +1994,8 @@ def from_records( | |
arr_columns = arr_columns.drop(arr_exclude) | ||
columns = columns.drop(exclude) | ||
|
||
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) | ||
manager = get_option("mode.data_manager") | ||
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager) | ||
|
||
return cls(mgr) | ||
|
||
|
@@ -2178,13 +2202,15 @@ def _from_arrays( | |
if dtype is not None: | ||
dtype = pandas_dtype(dtype) | ||
|
||
manager = get_option("mode.data_manager") | ||
mgr = arrays_to_mgr( | ||
arrays, | ||
columns, | ||
index, | ||
columns, | ||
dtype=dtype, | ||
verify_integrity=verify_integrity, | ||
typ=manager, | ||
) | ||
return cls(mgr) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,7 @@ | |
) | ||
from pandas.core.arrays import Categorical | ||
from pandas.core.construction import ( | ||
ensure_wrapped_if_datetimelike, | ||
extract_array, | ||
sanitize_array, | ||
) | ||
|
@@ -71,7 +72,9 @@ | |
get_objs_combined_axis, | ||
union_indexes, | ||
) | ||
from pandas.core.internals.array_manager import ArrayManager | ||
from pandas.core.internals.managers import ( | ||
BlockManager, | ||
create_block_manager_from_arrays, | ||
create_block_manager_from_blocks, | ||
) | ||
|
@@ -90,6 +93,7 @@ def arrays_to_mgr( | |
columns, | ||
dtype: Optional[DtypeObj] = None, | ||
verify_integrity: bool = True, | ||
typ: Optional[str] = None, | ||
): | ||
""" | ||
Segregate Series based on type and coerce into matrices. | ||
|
@@ -107,7 +111,8 @@ def arrays_to_mgr( | |
|
||
# don't force copy because getting jammed in an ndarray anyway | ||
arrays = _homogenize(arrays, index, dtype) | ||
|
||
if typ == "array": | ||
arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could do this unconditionally. Tradeoff: slightly less complex code here, at the perf cost of unnecessary wrapping/unwrapping in DatetimeBlock/TimedeltaBlock._maybe_coerce_values (for now, until an upcoming PR). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would then leave that until it is useful. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you do this inside, e.g., the ArrayManager constructor (or add a constructor on L127)? Probably should do that anyhow. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
What exactly do you mean by that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the suggestion is to move the call on L115 to just before the […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I intentionally put it here because, in theory, I think this wrapping only needs to be done with the default of […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I mean that on L127 this should call a method inside internals rather than calling the manager directly. Then you can handle the special cases on L110 inside that function. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So you want me to write this function?
(IMO this just splits the handling of
Note that you are commenting on a file that is already inside internals. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's L110 that needs to be handled in array_manager.py and not here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, moved inside […] |
||
columns = ensure_index(columns) | ||
else: | ||
columns = ensure_index(columns) | ||
|
@@ -116,11 +121,16 @@ def arrays_to_mgr( | |
# from BlockManager perspective | ||
axes = [columns, index] | ||
|
||
return create_block_manager_from_arrays(arrays, arr_names, axes) | ||
if typ == "block": | ||
return create_block_manager_from_arrays(arrays, arr_names, axes) | ||
elif typ == "array": | ||
return ArrayManager(arrays, [index, columns]) | ||
else: | ||
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") | ||
|
||
|
||
def masked_rec_array_to_mgr( | ||
data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool | ||
data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str | ||
): | ||
""" | ||
Extract from a masked rec array and create the manager. | ||
|
@@ -154,7 +164,7 @@ def masked_rec_array_to_mgr( | |
if columns is None: | ||
columns = arr_columns | ||
|
||
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) | ||
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ) | ||
|
||
if copy: | ||
mgr = mgr.copy() | ||
|
@@ -166,19 +176,14 @@ def mgr_to_mgr(mgr, typ: str): | |
Convert to specific type of Manager. Does not copy if the type is already | ||
correct. Does not guarantee a copy otherwise. | ||
""" | ||
from pandas.core.internals import ( | ||
ArrayManager, | ||
BlockManager, | ||
) | ||
|
||
new_mgr: Manager | ||
|
||
if typ == "block": | ||
if isinstance(mgr, BlockManager): | ||
new_mgr = mgr | ||
else: | ||
new_mgr = arrays_to_mgr( | ||
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None | ||
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block" | ||
) | ||
elif typ == "array": | ||
if isinstance(mgr, ArrayManager): | ||
|
@@ -187,15 +192,17 @@ def mgr_to_mgr(mgr, typ: str): | |
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))] | ||
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) | ||
else: | ||
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'") | ||
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") | ||
return new_mgr | ||
|
||
|
||
# --------------------------------------------------------------------- | ||
# DataFrame Constructor Interface | ||
|
||
|
||
def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): | ||
def init_ndarray( | ||
values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str | ||
): | ||
# input must be a ndarray, list, Series, index | ||
|
||
if isinstance(values, ABCSeries): | ||
|
@@ -224,7 +231,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): | |
if columns is None: | ||
columns = Index(range(len(values))) | ||
|
||
return arrays_to_mgr(values, columns, index, columns, dtype=dtype) | ||
return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) | ||
|
||
# by definition an array here | ||
# the dtypes will be coerced to a single dtype | ||
|
@@ -277,7 +284,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): | |
return create_block_manager_from_blocks(block_values, [columns, index]) | ||
|
||
|
||
def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): | ||
def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str): | ||
""" | ||
Segregate Series based on type and coerce into matrices. | ||
Needs to handle a lot of exceptional cases. | ||
|
@@ -321,7 +328,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): | |
arrays = [ | ||
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays | ||
] | ||
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) | ||
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ) | ||
|
||
|
||
def nested_data_to_arrays( | ||
|
@@ -415,6 +422,11 @@ def _homogenize(data, index: Index, dtype: Optional[DtypeObj]): | |
# Forces alignment. No need to copy data since we | ||
# are putting it into an ndarray later | ||
val = val.reindex(index, copy=False) | ||
# TODO extract_array should be preferred, but that gives failures for | ||
# `extension/test_numpy.py` (extract_array will convert numpy arrays | ||
# to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021 | ||
# val = extract_array(val, extract_numpy=True) | ||
val = val._values | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
if isinstance(val, dict): | ||
if oindex is None: | ||
|
Uh oh!
There was an error while loading. Please reload this page.