From 8f94ce0eff05810bbdc905c67c2d4a8178db469e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Jan 2018 10:55:42 -0800 Subject: [PATCH 1/6] unify logic for form_blocks and make_blocks --- pandas/core/internals.py | 163 ++++++++++++++++++++------------------- 1 file changed, 84 insertions(+), 79 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ba90503e3bf40..1c4c05473ea5d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2914,37 +2914,54 @@ def sparse_reindex(self, new_index): placement=self.mgr_locs) +_block_type_map = { + 'int': IntBlock, + 'complex': ComplexBlock, + 'float': FloatBlock, + 'sparse': SparseBlock, + 'timedelta': TimeDeltaBlock, + 'bool': BoolBlock, + 'object': ObjectBlock, + 'cat': CategoricalBlock, + 'datetime': DatetimeBlock, + 'datetime_tz': DatetimeTZBlock} + + +def _get_block_type(values, dtype=None): + dtype = dtype or values.dtype + vtype = dtype.type + + if is_sparse(values): + block_type = 'sparse' + elif issubclass(vtype, np.floating): + block_type = 'float' + elif issubclass(vtype, np.timedelta64): + assert issubclass(vtype, np.integer) + block_type = 'timedelta' + elif issubclass(vtype, np.complexfloating): + block_type = 'complex' + elif issubclass(vtype, np.datetime64): + assert not is_datetimetz(values) + block_type = 'datetime' + elif is_datetimetz(values): + block_type = 'datetime_tz' + elif issubclass(vtype, np.integer): + block_type = 'int' + elif dtype == np.bool_: + block_type = 'bool' + elif is_categorical(values): + block_type = 'cat' + else: + block_type = 'object' + return block_type + + def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=False): if klass is None: dtype = dtype or values.dtype - vtype = dtype.type - - if isinstance(values, SparseArray): - klass = SparseBlock - elif issubclass(vtype, np.floating): - klass = FloatBlock - elif (issubclass(vtype, np.integer) and - issubclass(vtype, np.timedelta64)): - klass = TimeDeltaBlock - elif (issubclass(vtype, np.integer) and - not issubclass(vtype, np.datetime64)): - klass = IntBlock - elif dtype == np.bool_: - klass = BoolBlock - elif issubclass(vtype, np.datetime64): - if hasattr(values, 'tz'): - klass = DatetimeTZBlock - else: - klass = DatetimeBlock - elif is_datetimetz(values): - klass = DatetimeTZBlock - elif issubclass(vtype, np.complexfloating): - klass = ComplexBlock - elif is_categorical(values): - klass = CategoricalBlock - else: - klass = ObjectBlock + block_type = _get_block_type(values, dtype) + klass = _block_type_map[block_type] elif klass is DatetimeTZBlock and not is_datetimetz(values): return klass(values, ndim=ndim, fastpath=fastpath, @@ -4660,15 +4677,16 @@ def create_block_manager_from_arrays(arrays, names, axes): def form_blocks(arrays, names, axes): # put "leftover" items in float bucket, where else? # generalize? - float_items = [] - complex_items = [] - int_items = [] - bool_items = [] - object_items = [] - sparse_items = [] - datetime_items = [] - datetime_tz_items = [] - cat_items = [] + items_dict = {'float': [], + 'complex': [], + 'int': [], + 'bool': [], + 'object': [], + 'sparse': [], + 'timedelta': [], + 'datetime': [], + 'datetime_tz': [], + 'cat': []} extra_locs = [] names_idx = Index(names) @@ -4686,72 +4704,59 @@ def form_blocks(arrays, names, axes): k = names[name_idx] v = arrays[name_idx] - if is_sparse(v): - sparse_items.append((i, k, v)) - elif issubclass(v.dtype.type, np.floating): - float_items.append((i, k, v)) - elif issubclass(v.dtype.type, np.complexfloating): - complex_items.append((i, k, v)) - elif issubclass(v.dtype.type, np.datetime64): - if v.dtype != _NS_DTYPE: - v = conversion.ensure_datetime64ns(v) - - if is_datetimetz(v): - datetime_tz_items.append((i, k, v)) - else: - datetime_items.append((i, k, v)) - elif is_datetimetz(v): - datetime_tz_items.append((i, k, v)) - elif issubclass(v.dtype.type, np.integer): - int_items.append((i, k, v)) - elif v.dtype == np.bool_: - bool_items.append((i, k, v)) - elif is_categorical(v): - cat_items.append((i, k, v)) - else: - object_items.append((i, k, v)) + block_type = _get_block_type(v) + + if block_type == 'datetime' and v.dtype != _NS_DTYPE: + # TODO: i dont think this is necessary + v = conversion.ensure_datetime64ns(v) + + items_dict[block_type].append((i, k, v)) blocks = [] - if len(float_items): - float_blocks = _multi_blockify(float_items) + if len(items_dict['float']): + float_blocks = _multi_blockify(items_dict['float']) blocks.extend(float_blocks) - if len(complex_items): - complex_blocks = _multi_blockify(complex_items) + if len(items_dict['complex']): + complex_blocks = _multi_blockify(items_dict['complex']) blocks.extend(complex_blocks) - if len(int_items): - int_blocks = _multi_blockify(int_items) + if len(items_dict['timedelta']): + timedelta_blocks = _multi_blockify(items_dict['timedelta']) + blocks.extend(timedelta_blocks) + + if len(items_dict['int']): + int_blocks = _multi_blockify(items_dict['int']) blocks.extend(int_blocks) - if len(datetime_items): - datetime_blocks = _simple_blockify(datetime_items, _NS_DTYPE) + if len(items_dict['datetime']): + datetime_blocks = _simple_blockify(items_dict['datetime'], _NS_DTYPE) blocks.extend(datetime_blocks) - if len(datetime_tz_items): + if len(items_dict['datetime_tz']): dttz_blocks = [make_block(array, klass=DatetimeTZBlock, fastpath=True, - placement=[i], ) - for i, _, array in datetime_tz_items] + placement=[i]) + for i, _, array in items_dict['datetime_tz']] blocks.extend(dttz_blocks) - if len(bool_items): - bool_blocks = _simple_blockify(bool_items, np.bool_) + if len(items_dict['bool']): + bool_blocks = _simple_blockify(items_dict['bool'], np.bool_) blocks.extend(bool_blocks) - if len(object_items) > 0: - object_blocks = _simple_blockify(object_items, np.object_) + if len(items_dict['object']) > 0: + object_blocks = _simple_blockify(items_dict['object'], np.object_) blocks.extend(object_blocks) - if len(sparse_items) > 0: - sparse_blocks = _sparse_blockify(sparse_items) + if len(items_dict['sparse']) > 0: + sparse_blocks = _sparse_blockify(items_dict['sparse']) blocks.extend(sparse_blocks) - if len(cat_items) > 0: + if len(items_dict['cat']) > 0: cat_blocks = [make_block(array, klass=CategoricalBlock, fastpath=True, placement=[i]) - for i, _, array in cat_items] + for i, _, array in items_dict['cat']] blocks.extend(cat_blocks) if len(extra_locs): From 989c51aed457bee05e42a2539cf69a368a8445c7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Jan 2018 17:45:51 -0800 Subject: [PATCH 2/6] deprivatize, docstring, defaultdict --- pandas/core/internals.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1c4c05473ea5d..1a6f5fc53f5e9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2927,7 +2927,23 @@ def sparse_reindex(self, new_index): 'datetime_tz': DatetimeTZBlock} -def _get_block_type(values, dtype=None): +def get_block_type(values, dtype=None): + """ + Find the appropriate Block subclass to use for the given values and dtype. + + Parameters + ---------- + values : ndarray-like + dtype : numpy or pandas dtype + + Returns + ------- + block_type : str + + See Also + -------- + _block_type_map : maps block_type to Block class objects + """ dtype = dtype or values.dtype vtype = dtype.type @@ -2960,7 +2976,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=False): if klass is None: dtype = dtype or values.dtype - block_type = _get_block_type(values, dtype) + block_type = get_block_type(values, dtype) klass = _block_type_map[block_type] elif klass is DatetimeTZBlock and not is_datetimetz(values): @@ -4677,16 +4693,7 @@ def create_block_manager_from_arrays(arrays, names, axes): def form_blocks(arrays, names, axes): # put "leftover" items in float bucket, where else? # generalize? - items_dict = {'float': [], - 'complex': [], - 'int': [], - 'bool': [], - 'object': [], - 'sparse': [], - 'timedelta': [], - 'datetime': [], - 'datetime_tz': [], - 'cat': []} + items_dict = defaultdict([]) extra_locs = [] names_idx = Index(names) @@ -4704,12 +4711,7 @@ def form_blocks(arrays, names, axes): k = names[name_idx] v = arrays[name_idx] - block_type = _get_block_type(v) - - if block_type == 'datetime' and v.dtype != _NS_DTYPE: - # TODO: i dont think this is necessary - v = conversion.ensure_datetime64ns(v) - + block_type = get_block_type(v) items_dict[block_type].append((i, k, v)) blocks = [] From fdfca24b2bf40ccaca454cf22ea44d7ec22c2c7b Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 12 Jan 2018 08:42:18 -0800 Subject: [PATCH 3/6] fix defaultdict usage --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1a6f5fc53f5e9..47a47feddbac8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4693,7 +4693,7 @@ def create_block_manager_from_arrays(arrays, names, axes): def form_blocks(arrays, names, axes): # put "leftover" items in float bucket, where else? # generalize? - items_dict = defaultdict([]) + items_dict = defaultdict(list) extra_locs = [] names_idx = Index(names) From b2c27c16be71a23d97c411217a3ede6ac06a4bb5 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 17 Jan 2018 10:52:55 -0800 Subject: [PATCH 4/6] remove global _block_type_map --- pandas/core/internals.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 47a47feddbac8..f1af02af3cdf6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2939,10 +2939,6 @@ def get_block_type(values, dtype=None): Returns ------- block_type : str - - See Also - -------- - _block_type_map : maps block_type to Block class objects """ dtype = dtype or values.dtype vtype = dtype.type @@ -2977,7 +2973,17 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, if klass is None: dtype = dtype or values.dtype block_type = get_block_type(values, dtype) - klass = _block_type_map[block_type] + block_type_map = {'int': IntBlock, + 'complex': ComplexBlock, + 'float': FloatBlock, + 'sparse': SparseBlock, + 'timedelta': TimeDeltaBlock, + 'bool': BoolBlock, + 'object': ObjectBlock, + 'cat': CategoricalBlock, + 'datetime': DatetimeBlock, + 'datetime_tz': DatetimeTZBlock} + klass = block_type_map[block_type] elif klass is DatetimeTZBlock and not is_datetimetz(values): return klass(values, ndim=ndim, fastpath=fastpath, From 6c4c1596f9ffcaff0f14cdd005a36fdeb7730d98 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 17 Jan 2018 10:53:42 -0800 Subject: [PATCH 5/6] actually remove _block_type_map --- pandas/core/internals.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f1af02af3cdf6..1b7c2f96ad0b5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2914,19 +2914,6 @@ def sparse_reindex(self, new_index): placement=self.mgr_locs) -_block_type_map = { - 'int': IntBlock, - 'complex': ComplexBlock, - 'float': FloatBlock, - 'sparse': SparseBlock, - 'timedelta': TimeDeltaBlock, - 'bool': BoolBlock, - 'object': ObjectBlock, - 'cat': CategoricalBlock, - 'datetime': DatetimeBlock, - 'datetime_tz': DatetimeTZBlock} - - def get_block_type(values, dtype=None): """ Find the appropriate Block subclass to use for the given values and dtype. From 9d5bd26bb18adec21d3464c018d8eb2bb930a4c7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 17 Jan 2018 17:41:05 -0800 Subject: [PATCH 6/6] remove dict step --- pandas/core/internals.py | 80 ++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1b7c2f96ad0b5..079f3113dc2da 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2925,52 +2925,41 @@ def get_block_type(values, dtype=None): Returns ------- - block_type : str + cls : class, subclass of Block """ dtype = dtype or values.dtype vtype = dtype.type if is_sparse(values): - block_type = 'sparse' + cls = SparseBlock elif issubclass(vtype, np.floating): - block_type = 'float' + cls = FloatBlock elif issubclass(vtype, np.timedelta64): assert issubclass(vtype, np.integer) - block_type = 'timedelta' + cls = TimeDeltaBlock elif issubclass(vtype, np.complexfloating): - block_type = 'complex' + cls = ComplexBlock elif issubclass(vtype, np.datetime64): assert not is_datetimetz(values) - block_type = 'datetime' + cls = DatetimeBlock elif is_datetimetz(values): - block_type = 'datetime_tz' + cls = DatetimeTZBlock elif issubclass(vtype, np.integer): - block_type = 'int' + cls = IntBlock elif dtype == np.bool_: - block_type = 'bool' + cls = BoolBlock elif is_categorical(values): - block_type = 'cat' + cls = CategoricalBlock else: - block_type = 'object' - return block_type + cls = ObjectBlock + return cls def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=False): if klass is None: dtype = dtype or values.dtype - block_type = get_block_type(values, dtype) - block_type_map = {'int': IntBlock, - 'complex': ComplexBlock, - 'float': FloatBlock, - 'sparse': SparseBlock, - 'timedelta': TimeDeltaBlock, - 'bool': BoolBlock, - 'object': ObjectBlock, - 'cat': CategoricalBlock, - 'datetime': DatetimeBlock, - 'datetime_tz': DatetimeTZBlock} - klass = block_type_map[block_type] + klass = get_block_type(values, dtype) elif klass is DatetimeTZBlock and not is_datetimetz(values): return klass(values, ndim=ndim, fastpath=fastpath, @@ -4705,53 +4694,54 @@ def form_blocks(arrays, names, axes): v = arrays[name_idx] block_type = get_block_type(v) - items_dict[block_type].append((i, k, v)) + items_dict[block_type.__name__].append((i, k, v)) blocks = [] - if len(items_dict['float']): - float_blocks = _multi_blockify(items_dict['float']) + if len(items_dict['FloatBlock']): + float_blocks = _multi_blockify(items_dict['FloatBlock']) blocks.extend(float_blocks) - if len(items_dict['complex']): - complex_blocks = _multi_blockify(items_dict['complex']) + if len(items_dict['ComplexBlock']): + complex_blocks = _multi_blockify(items_dict['ComplexBlock']) blocks.extend(complex_blocks) - if len(items_dict['timedelta']): - timedelta_blocks = _multi_blockify(items_dict['timedelta']) + if len(items_dict['TimeDeltaBlock']): + timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) blocks.extend(timedelta_blocks) - if len(items_dict['int']): - int_blocks = _multi_blockify(items_dict['int']) + if len(items_dict['IntBlock']): + int_blocks = _multi_blockify(items_dict['IntBlock']) blocks.extend(int_blocks) - if len(items_dict['datetime']): - datetime_blocks = _simple_blockify(items_dict['datetime'], _NS_DTYPE) + if len(items_dict['DatetimeBlock']): + datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], + _NS_DTYPE) blocks.extend(datetime_blocks) - if len(items_dict['datetime_tz']): + if len(items_dict['DatetimeTZBlock']): dttz_blocks = [make_block(array, klass=DatetimeTZBlock, fastpath=True, placement=[i]) - for i, _, array in items_dict['datetime_tz']] + for i, _, array in items_dict['DatetimeTZBlock']] blocks.extend(dttz_blocks) - if len(items_dict['bool']): - bool_blocks = _simple_blockify(items_dict['bool'], np.bool_) + if len(items_dict['BoolBlock']): + bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) blocks.extend(bool_blocks) - if len(items_dict['object']) > 0: - object_blocks = _simple_blockify(items_dict['object'], np.object_) + if len(items_dict['ObjectBlock']) > 0: + object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) blocks.extend(object_blocks) - if len(items_dict['sparse']) > 0: - sparse_blocks = _sparse_blockify(items_dict['sparse']) + if len(items_dict['SparseBlock']) > 0: + sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) blocks.extend(sparse_blocks) - if len(items_dict['cat']) > 0: + if len(items_dict['CategoricalBlock']) > 0: cat_blocks = [make_block(array, klass=CategoricalBlock, fastpath=True, placement=[i]) - for i, _, array in items_dict['cat']] + for i, _, array in items_dict['CategoricalBlock']] blocks.extend(cat_blocks) if len(extra_locs):