From a75db0c8dc6a48883711fd895b1df0648ed0781f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Dec 2019 15:16:17 -0800 Subject: [PATCH 1/5] Make read_axes return instead of alter self --- pandas/io/pytables.py | 77 ++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a95d7f39ab82c..0cb086efee4a2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1999,15 +1999,16 @@ def convert( kwargs["name"] = _ensure_decoded(self.index_name) # making an Index instance could throw a number of different errors try: - self.values = Index(values, **kwargs) + values = Index(values, **kwargs) except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') if "freq" in kwargs: kwargs["freq"] = None - self.values = Index(values, **kwargs) + values = Index(values, **kwargs) - self.values = _set_tz(self.values, self.tz) + out = _set_tz(values, self.tz) + return out, out def take_data(self): """ return the values & release the memory """ @@ -2195,7 +2196,8 @@ def convert( _start = start if start is not None else 0 _stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows - self.values = Int64Index(np.arange(_stop - _start)) + out = Int64Index(np.arange(_stop - _start)) + return out, out def get_attr(self): pass @@ -2444,6 +2446,7 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): # NB: unlike in the other calls to set_data, self.dtype may not be None here self.set_data(values) + converted = self.data # TODO: Setting should not be necessary # use the meta if needed meta = _ensure_decoded(self.meta) @@ -2456,25 +2459,25 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): if dtype == "datetime64": # recreate with tz if indicated - self.data = _set_tz(self.data, self.tz, coerce=True) + converted = _set_tz(converted, self.tz, 
coerce=True) elif dtype == "timedelta64": - self.data = np.asarray(self.data, dtype="m8[ns]") + converted = np.asarray(converted, dtype="m8[ns]") elif dtype == "date": try: - self.data = np.asarray( - [date.fromordinal(v) for v in self.data], dtype=object + converted = np.asarray( + [date.fromordinal(v) for v in converted], dtype=object ) except ValueError: - self.data = np.asarray( - [date.fromtimestamp(v) for v in self.data], dtype=object + converted = np.asarray( + [date.fromtimestamp(v) for v in converted], dtype=object ) elif meta == "category": # we have a categorical categories = self.metadata - codes = self.data.ravel() + codes = converted.ravel() # if we have stored a NaN in the categories # then strip it; in theory we could have BOTH @@ -2491,23 +2494,25 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): categories = categories[~mask] codes[codes != -1] -= mask.astype(int).cumsum().values - self.data = Categorical.from_codes( + converted = Categorical.from_codes( codes, categories=categories, ordered=self.ordered ) else: try: - self.data = self.data.astype(dtype, copy=False) + converted = converted.astype(dtype, copy=False) except TypeError: - self.data = self.data.astype("O", copy=False) + converted = converted.astype("O", copy=False) # convert nans / decode if _ensure_decoded(self.kind) == "string": - self.data = _unconvert_string_array( - self.data, nan_rep=nan_rep, encoding=encoding, errors=errors + converted = _unconvert_string_array( + converted, nan_rep=nan_rep, encoding=encoding, errors=errors ) + return self.values, converted + def get_attr(self): """ get the data for this column """ self.values = getattr(self.attrs, self.kind_attr, None) @@ -3608,7 +3613,7 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): ) v.create_index(**kw) - def read_axes( + def _read_axes( self, where, start: Optional[int] = None, stop: Optional[int] = None ) -> bool: """ @@ -3622,8 +3627,6 @@ def read_axes( 
Returns ------- - bool - Indicates success. """ # validate the version @@ -3637,10 +3640,11 @@ def read_axes( selection = Selection(self, where=where, start=start, stop=stop) values = selection.select() + results = [] # convert the data for a in self.axes: a.set_info(self.info) - a.convert( + res = a.convert( values, nan_rep=self.nan_rep, encoding=self.encoding, @@ -3648,8 +3652,9 @@ def read_axes( start=start, stop=stop, ) + results.append(res) - return True + return results def get_object(self, obj, transposed: bool): """ return the data for this obj """ @@ -4082,13 +4087,13 @@ def read_column( # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - a.convert( + col_values = a.convert( c[start:stop], nan_rep=self.nan_rep, encoding=self.encoding, errors=self.errors, ) - return Series(_set_tz(a.take_data(), a.tz), name=column) + return Series(_set_tz(col_values[1], a.tz), name=column) raise KeyError(f"column [{column}] not found in the table") @@ -4372,7 +4377,8 @@ def read( stop: Optional[int] = None, ): - if not self.read_axes(where=where, start=start, stop=stop): + result = self._read_axes(where=where, start=start, stop=stop) + if result is False: return None info = ( @@ -4380,26 +4386,37 @@ def read( if len(self.non_index_axes) else dict() ) - index = self.index_axes[0].values + + axes = list(self.axes) + inds = [i for i in range(len(axes)) if axes[i] is self.index_axes[0]] + assert len(inds) == 1 + ind = inds[0] + + index = result[ind][0]#self.index_axes[0].values + frames = [] - for a in self.values_axes: + for i, a in enumerate(self.axes): + if a not in self.values_axes: + continue + values, cvalues = result[i] # we could have a multi-index constructor here # ensure_index doesn't recognized our list-of-tuples here if info.get("type") == "MultiIndex": - cols = MultiIndex.from_tuples(a.values) + cols = MultiIndex.from_tuples(values) else: - cols = Index(a.values) + cols = Index(values) + names = 
info.get("names") if names is not None: cols.set_names(names, inplace=True) if self.is_transposed: - values = a.cvalues + values = cvalues index_ = cols cols_ = Index(index, name=getattr(index, "name", None)) else: - values = a.cvalues.T + values = cvalues.T index_ = Index(index, name=getattr(index, "name", None)) cols_ = cols From f0dfbb649f7b4291d5efb127f4edc3fcb0283ccf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Dec 2019 10:56:09 -0800 Subject: [PATCH 2/5] port _get_data_and_dtype_name to avoid calling set_data at all --- pandas/io/pytables.py | 49 ++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 14501eed84e19..e9eda31c96eb6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2373,24 +2373,28 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): assert self.typ is not None if self.dtype is None: - self.set_data(values) + converted, dtype_name = _get_data_and_dtype_name(values) + kind = _dtype_to_kind(dtype_name) else: - self.data = values - converted = self.data # TODO: Setting should not be necessary + converted = values + dtype_name = self.dtype + kind = self.kind # use the meta if needed meta = _ensure_decoded(self.meta) + metadata = self.metadata + ordered = self.ordered + tz = self.tz - assert self.dtype is not None + assert dtype_name is not None # convert to the correct dtype - dtype = _ensure_decoded(self.dtype) - + dtype = _ensure_decoded(dtype_name) # reverse converts if dtype == "datetime64": # recreate with tz if indicated - converted = _set_tz(converted, self.tz, coerce=True) + converted = _set_tz(converted, tz, coerce=True) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -2407,7 +2411,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): elif meta == "category": # we have a categorical - categories = self.metadata + categories = 
metadata codes = converted.ravel() # if we have stored a NaN in the categories @@ -2426,7 +2430,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): codes[codes != -1] -= mask.astype(int).cumsum().values converted = Categorical.from_codes( - codes, categories=categories, ordered=self.ordered + codes, categories=categories, ordered=ordered ) else: @@ -2436,9 +2440,8 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): except TypeError: converted = converted.astype("O", copy=False) - # convert nans / decode - if _ensure_decoded(self.kind) == "string": + if _ensure_decoded(kind) == "string": converted = _unconvert_string_array( converted, nan_rep=nan_rep, encoding=encoding, errors=errors ) @@ -4357,7 +4360,7 @@ def read( assert len(inds) == 1 ind = inds[0] - index = result[ind][0]#self.index_axes[0].values + index = result[ind][0] frames = [] for i, a in enumerate(self.axes): @@ -4979,6 +4982,28 @@ def _dtype_to_kind(dtype_str: str) -> str: return kind +def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): + """ + Convert the passed data into a storable form and a dtype string. + """ + if is_categorical_dtype(data.dtype): + data = data.codes + + # For datetime64tz we need to drop the TZ in tests TODO: why? + dtype_name = data.dtype.name.split("[")[0] + + if data.dtype.kind in ["m", "M"]: + data = np.asarray(data.view("i8")) + # TODO: we used to reshape for the dt64tz case, but no longer + # doing that doesn't seem to break anything. why? + + elif isinstance(data, PeriodIndex): + data = data.asi8 + + data = np.asarray(data) + return data, dtype_name + + class Selection: """ Carries out a selection operation on a tables.Table object. 
From 0e46c4d5b1e20c0030191b2d6553965338186cf0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Dec 2019 08:25:18 -0800 Subject: [PATCH 3/5] comment --- pandas/io/pytables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e9eda31c96eb6..8c2218e7ac36f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2373,6 +2373,8 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): assert self.typ is not None if self.dtype is None: + # Note: in tests we never have timedelta64 or datetime64, + # so the _get_data_and_dtype_name may be unnecessary converted, dtype_name = _get_data_and_dtype_name(values) kind = _dtype_to_kind(dtype_name) else: From e7974f2fca6153f8c5f36b1733d720612e91a508 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Dec 2019 11:27:49 -0800 Subject: [PATCH 4/5] DOC: improve docstrings --- pandas/io/pytables.py | 47 +++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 401f5b9e8422d..8456a4b6015e4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1974,7 +1974,9 @@ def is_indexed(self) -> bool: return getattr(self.table.cols, self.cname).is_indexed # type: ignore def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): - """ set the values from this selection: take = take ownership """ + """ + Convert the data from this selection to the appropriate pandas type. + """ assert isinstance(values, np.ndarray), type(values) # values is a recarray @@ -2158,7 +2160,7 @@ def is_indexed(self) -> bool: def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): """ - Set the values from this selection. + Convert the data from this selection to the appropriate pandas type. 
Parameters ---------- @@ -2356,8 +2358,20 @@ def validate_attr(self, append): ) def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): - """set the data from this selection (and convert to the correct dtype - if we can) + """ + Convert the data from this selection to the appropriate pandas type. + + Parameters + ---------- + values : np.ndarray + nan_rep : + encoding : str + errors : str + + Returns + ------- + index : listlike to become an Index + data : ndarraylike to become a column """ assert isinstance(values, np.ndarray), type(values) @@ -3570,7 +3584,7 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): def _read_axes( self, where, start: Optional[int] = None, stop: Optional[int] = None - ) -> bool: + ) -> List[Tuple[ArrayLike, ArrayLike]]: """ Create the axes sniffed from the table. @@ -3582,15 +3596,9 @@ def _read_axes( Returns ------- + List[Tuple[index_values, column_values]] """ - # validate the version - self.validate_version(where) - - # infer the data kind - if not self.infer_axes(): - return False - # create the selection selection = Selection(self, where=where, start=start, stop=stop) values = selection.select() @@ -4341,10 +4349,15 @@ def read( stop: Optional[int] = None, ): - result = self._read_axes(where=where, start=start, stop=stop) - if result is False: + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): return None + result = self._read_axes(where=where, start=start, stop=stop) + info = ( self.info.get(self.non_index_axes[0][0], dict()) if len(self.non_index_axes) @@ -4362,14 +4375,14 @@ def read( for i, a in enumerate(self.axes): if a not in self.values_axes: continue - values, cvalues = result[i] + index_vals, cvalues = result[i] # we could have a multi-index constructor here # ensure_index doesn't recognized our list-of-tuples here if info.get("type") == "MultiIndex": - cols = MultiIndex.from_tuples(values) + cols = 
MultiIndex.from_tuples(index_vals) else: - cols = Index(values) + cols = Index(index_vals) names = info.get("names") if names is not None: From 648c86410d5d5f474142eb6e6251e80c29661b1f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Dec 2019 08:43:32 -0800 Subject: [PATCH 5/5] inline --- pandas/io/pytables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index bd5ed4fa53456..3d4e4252adfdd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4360,8 +4360,7 @@ def read( else dict() ) - axes = list(self.axes) - inds = [i for i in range(len(axes)) if axes[i] is self.index_axes[0]] + inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] assert len(inds) == 1 ind = inds[0]