From f91606adde8aa01bb7913785fc10fa5e499b1bb9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 1 Dec 2019 15:39:49 -0800 Subject: [PATCH 1/5] merge --- pandas/io/pytables.py | 51 ++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6ef821fc52d46..fca89c04667de 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3460,12 +3460,12 @@ def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - self._indexables = [] + _indexables = [] # Note: each of the `name` kwargs below are str, ensured # by the definition in index_cols. # index columns - self._indexables.extend( + _indexables.extend( [ IndexCol(name=name, axis=axis, pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols) @@ -3474,7 +3474,7 @@ def indexables(self): # values columns dc = set(self.data_columns) - base_pos = len(self._indexables) + base_pos = len(_indexables) def f(i, c): assert isinstance(c, str) @@ -3487,10 +3487,12 @@ def f(i, c): # Note: the definition of `values_cols` ensures that each # `c` below is a str. - self._indexables.extend( + _indexables.extend( [f(i, c) for i, c in enumerate(self.attrs.values_cols)] ) + self._indexables = _indexables + return self._indexables def create_index(self, columns=None, optlevel=None, kind=None): @@ -3703,6 +3705,7 @@ def create_axes( else: existing_table = None + assert self.ndim == 2 # with next check, we must have len(axes) == 1 # currently support on ndim-1 axes if len(axes) != self.ndim - 1: raise ValueError( @@ -3720,14 +3723,16 @@ def create_axes( self.nan_rep = nan_rep # create axes to index and non_index - index_axes_map = dict() for i, a in enumerate(obj.axes): if i in axes: name = obj._AXIS_NAMES[i] new_index = _convert_index(name, a, self.encoding, self.errors) new_index.axis = i - index_axes_map[i] = new_index + + # Because we are always 2D, only one axis ever gets here, so + # we know it will have pos=0 + new_index.set_pos(0) else: @@ -3754,20 +3759,18 @@ def create_axes( self.non_index_axes.append((i, append_axis)) - # set axis positions (based on the axes) - new_index_axes = [index_axes_map[a] for a in axes] - for j, iax in enumerate(new_index_axes): - iax.set_pos(j) - iax.update_info(self.info) - self.index_axes = new_index_axes + new_index.update_info(self.info) + self.index_axes = [new_index] j = len(self.index_axes) + assert j == 1 # check for column conflicts for a in self.axes: a.maybe_set_size(min_itemsize=min_itemsize) # reindex by our non_index_axes & compute data_columns + assert len(self.non_index_axes) == 1 for a in self.non_index_axes: obj = _reindex_axis(obj, a[0], a[1]) @@ -4152,21 +4155,7 @@ def write_data(self, chunksize: Optional[int], dropna: bool = False): # broadcast the indexes if needed indexes = [a.cvalues for a in self.index_axes] nindexes = len(indexes) - bindexes = [] - for i, idx in enumerate(indexes): - - # broadcast to all other indexes except myself - if i > 0 and i < nindexes: - repeater = np.prod([indexes[bi].shape[0] for bi in range(0, i)]) - idx = np.tile(idx, repeater) - - if i < nindexes - 1: - repeater = np.prod( - [indexes[bi].shape[0] for bi in range(i + 1, nindexes)] - ) - idx = np.repeat(idx, repeater) - - bindexes.append(idx) + assert nindexes == 1, nindexes # ensures we dont need to broadcast # transpose the values so first dimension is last # reshape the values if needed @@ -4191,7 +4180,7 @@ def write_data(self, chunksize: Optional[int], dropna: bool = False): self.write_data_chunk( rows, - indexes=[a[start_i:end_i] for a in bindexes], + indexes=[a[start_i:end_i] for a in indexes], mask=mask[start_i:end_i] if mask is not None else None, values=[v[start_i:end_i] for v in bvalues], ) @@ -4477,13 +4466,15 @@ def indexables(self): d = self.description # the index columns is just a simple index - self._indexables = [GenericIndexCol(name="index", axis=0)] + _indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): assert isinstance(n, str) dc = GenericDataIndexableCol(name=n, pos=i, values=[n]) - self._indexables.append(dc) + _indexables.append(dc) + + self._indexables = _indexables return self._indexables From 0daae16fcd3d32ef0e234028631098847efd14f0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Dec 2019 15:10:39 -0800 Subject: [PATCH 2/5] revert changes that are in other PRs --- pandas/io/pytables.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f161763afc221..b5457bef2ae3f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3527,12 +3527,12 @@ def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - _indexables = [] + self._indexables = [] # Note: each of the `name` kwargs below are str, ensured # by the definition in index_cols. # index columns - _indexables.extend( + self._indexables.extend( [ IndexCol(name=name, axis=axis, pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols) @@ -3541,7 +3541,7 @@ def indexables(self): # values columns dc = set(self.data_columns) - base_pos = len(_indexables) + base_pos = len(self._indexables) def f(i, c): assert isinstance(c, str) @@ -3554,12 +3554,10 @@ def f(i, c): # Note: the definition of `values_cols` ensures that each # `c` below is a str. - _indexables.extend( + self._indexables.extend( [f(i, c) for i, c in enumerate(self.attrs.values_cols)] ) - self._indexables = _indexables - return self._indexables def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): @@ -4235,7 +4233,21 @@ def write_data(self, chunksize: Optional[int], dropna: bool = False): # broadcast the indexes if needed indexes = [a.cvalues for a in self.index_axes] nindexes = len(indexes) - assert nindexes == 1, nindexes # ensures we dont need to broadcast + bindexes = [] + for i, idx in enumerate(indexes): + + # broadcast to all other indexes except myself + if i > 0 and i < nindexes: + repeater = np.prod([indexes[bi].shape[0] for bi in range(0, i)]) + idx = np.tile(idx, repeater) + + if i < nindexes - 1: + repeater = np.prod( + [indexes[bi].shape[0] for bi in range(i + 1, nindexes)] + ) + idx = np.repeat(idx, repeater) + + bindexes.append(idx) # transpose the values so first dimension is last # reshape the values if needed @@ -4260,7 +4272,7 @@ def write_data(self, chunksize: Optional[int], dropna: bool = False): self.write_data_chunk( rows, - indexes=[a[start_i:end_i] for a in indexes], + indexes=[a[start_i:end_i] for a in bindexes], mask=mask[start_i:end_i] if mask is not None else None, values=[v[start_i:end_i] for v in bvalues], ) @@ -4551,15 +4563,13 @@ def indexables(self): d = self.description # the index columns is just a simple index - _indexables = [GenericIndexCol(name="index", axis=0)] + self._indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): assert isinstance(n, str) dc = GenericDataIndexableCol(name=n, pos=i, values=[n]) - _indexables.append(dc) - - self._indexables = _indexables + self._indexables.append(dc) return self._indexables From d3030f24fde9ebbae61d31d44b216f4c9bab277a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Dec 2019 15:55:51 -0800 Subject: [PATCH 3/5] comment --- pandas/io/pytables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b5457bef2ae3f..8b6925c20692f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3832,6 +3832,8 @@ def create_axes( self.non_index_axes.append((i, append_axis)) + # Note: we can't do this update_info inside the loop because self.info + # is modified at another step in the loop above. new_index.update_info(self.info) self.index_axes = [new_index] From effdf96d81f0bdd9a2ab0a146f1f7d424035a63e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Dec 2019 13:17:37 -0800 Subject: [PATCH 4/5] de-loop --- pandas/io/pytables.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 70ca5542ed238..1da051e388226 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3889,15 +3889,12 @@ def create_axes( # Note: we can't do this update_info inside the loop because self.info # is modified at another step in the loop above. new_index.update_info(self.info) + new_index.maybe_set_size(min_itemsize) # check for column conflicts new_index_axes = [new_index] j = len(new_index_axes) assert j == 1 - # check for column conflicts - for a in new_index_axes: - a.maybe_set_size(min_itemsize=min_itemsize) - # reindex by our non_index_axes & compute data_columns assert len(new_non_index_axes) == 1 for a in new_non_index_axes: @@ -3906,7 +3903,7 @@ def create_axes( def get_blk_items(mgr, blocks): return [mgr.items.take(blk.mgr_locs) for blk in blocks] - transposed = new_index_axes[0].axis == 1 + transposed = new_index.axis == 1 # figure out data_columns and get out blocks block_obj = self.get_object(obj, transposed)._consolidate() From 02ac3b741e1fa5292180511092be62f0a6596104 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Dec 2019 13:40:24 -0800 Subject: [PATCH 5/5] REF: handle new-non_index_axis first --- pandas/io/pytables.py | 67 ++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1da051e388226..5a3f488ddb544 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3847,52 +3847,47 @@ def create_axes( if nan_rep is None: nan_rep = "nan" - # create axes to index and non_index - for i, a in enumerate(obj.axes): + # We construct the non-index-axis first, since that alters self.info + idx = [x for x in [0, 1] if x not in axes][0] - if i in axes: - name = obj._AXIS_NAMES[i] - new_index = _convert_index(name, a, self.encoding, self.errors) - new_index.axis = i + a = obj.axes[idx] + # we might be able to change the axes on the appending data if necessary + append_axis = list(a) + if existing_table is not None: + indexer = len(new_non_index_axes) # i.e. 0 + exist_axis = existing_table.non_index_axes[indexer][1] + if not array_equivalent(np.array(append_axis), np.array(exist_axis)): - # Because we are always 2D, only one axis ever gets here, so - # we know it will have pos=0 - new_index.set_pos(0) + # ahah! -> reindex + if array_equivalent( + np.array(sorted(append_axis)), np.array(sorted(exist_axis)) + ): + append_axis = exist_axis - else: + # the non_index_axes info + info = self.info.setdefault(idx, {}) + info["names"] = list(a.names) + info["type"] = type(a).__name__ - # we might be able to change the axes on the appending data if - # necessary - append_axis = list(a) - if existing_table is not None: - indexer = len(new_non_index_axes) - exist_axis = existing_table.non_index_axes[indexer][1] - if not array_equivalent( - np.array(append_axis), np.array(exist_axis) - ): - - # ahah! -> reindex - if array_equivalent( - np.array(sorted(append_axis)), np.array(sorted(exist_axis)) - ): - append_axis = exist_axis + new_non_index_axes.append((idx, append_axis)) - # the non_index_axes info - info = _get_info(self.info, i) - info["names"] = list(a.names) - info["type"] = type(a).__name__ + # Now we can construct our new index axis + idx = axes[0] + a = obj.axes[idx] + name = obj._AXIS_NAMES[idx] + new_index = _convert_index(name, a, self.encoding, self.errors) + new_index.axis = idx - new_non_index_axes.append((i, append_axis)) + # Because we are always 2D, there is only one new_index, so + # we know it will have pos=0 + new_index.set_pos(0) + new_index.update_info(self.info) + new_index.maybe_set_size(min_itemsize) # check for column conflicts self.non_index_axes = new_non_index_axes - # Note: we can't do this update_info inside the loop because self.info - # is modified at another step in the loop above. - new_index.update_info(self.info) - new_index.maybe_set_size(min_itemsize) # check for column conflicts new_index_axes = [new_index] - - j = len(new_index_axes) + j = len(new_index_axes) # i.e. 1 assert j == 1 # reindex by our non_index_axes & compute data_columns