Skip to content

Commit 22fc174

Browse files
committed
fix #563 : improved speed of read_hdf() when stored object is an LArray object
1 parent 01669f2 commit 22fc174

File tree

4 files changed

+46
-12
lines changed

4 files changed

+46
-12
lines changed

doc/source/changes/version_0_30.rst.inc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,6 @@ Miscellaneous improvements
223223

224224
* implemented :py:obj:`LArray.reverse()` method to reverse one or several axes of an array (closes :issue:`631`).
225225

226-
227226
* added :py:obj:`set_options` allowing to set options for larray within a ``with`` block or globally:
228227

229228
>>> from larray import *
@@ -272,6 +271,10 @@ Miscellaneous improvements
272271

273272
Closes :issue:`274`.
274273

274+
* improved speed of :py:obj:`read_hdf()` function when reading a stored LArray object dumped with
275+
the current and future versions of larray. To benefit from the speedup when reading arrays dumped
276+
with older versions of larray, please read and re-dump them. Closes :issue:`563`.
277+
275278

276279
Fixes
277280
-----

larray/core/array.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6194,7 +6194,9 @@ def to_hdf(self, filepath, key):
61946194
key = _translate_group_key_hdf(key)
61956195
with LHDFStore(filepath) as store:
61966196
store.put(key, self.to_frame())
6197-
store.get_storer(key).attrs.type = 'Array'
6197+
attrs = store.get_storer(key).attrs
6198+
attrs.type = 'Array'
6199+
attrs.writer = 'LArray'
61986200
self.meta.to_hdf(store, key)
61996201

62006202
@deprecate_kwarg('sheet_name', 'sheet')

larray/inout/hdf.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,12 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
3131
Value used to fill cells corresponding to label combinations which are not present in the input.
3232
Defaults to NaN.
3333
sort_rows : bool, optional
34-
Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
34+
Whether or not to sort the rows alphabetically.
35+
Must be False if the read array has been dumped with an larray version >= 0.30.
36+
Defaults to False.
3537
sort_columns : bool, optional
36-
Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
38+
Whether or not to sort the columns alphabetically.
39+
Must be False if the read array has been dumped with an larray version >= 0.30.
3740
Defaults to False.
3841
name : str, optional
3942
Name of the axis or group to return. If None, name is set to passed key.
@@ -69,12 +72,15 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
6972
with LHDFStore(filepath_or_buffer) as store:
7073
pd_obj = store.get(key)
7174
attrs = store.get_storer(key).attrs
75+
writer = attrs.writer if 'writer' in attrs else None
7276
# for backward compatibility but any object read from an hdf file should have an attribute 'type'
7377
_type = attrs.type if 'type' in attrs else 'Array'
7478
_meta = attrs.metadata if 'metadata' in attrs else None
7579
if _type == 'Array':
80+
# cartesian product is not necessary if the array was written by LArray
81+
cartesian_prod = writer != 'LArray'
7682
res = df_aslarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
77-
parse_header=False)
83+
parse_header=False, cartesian_prod=cartesian_prod)
7884
if _meta is not None:
7985
res.meta = _meta
8086
elif _type == 'Axis':

larray/inout/pandas.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs):
141141

142142

143143
def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfold_last_axis_name=False,
144-
fill_value=nan, meta=None, **kwargs):
144+
fill_value=nan, meta=None, cartesian_prod=True, **kwargs):
145145
r"""
146146
Converts Pandas DataFrame into LArray.
147147
@@ -151,9 +151,12 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
151151
Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
152152
columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
153153
sort_rows : bool, optional
154-
Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
154+
Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
155+
Must be False if `cartesian_prod` is set to False.
156+
Defaults to False.
155157
sort_columns : bool, optional
156158
Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
159+
Must be False if `cartesian_prod` is set to False.
157160
Defaults to False.
158161
parse_header : bool, optional
159162
Whether or not to parse columns labels. Pandas treats column labels as strings.
@@ -167,6 +170,11 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
167170
meta : list of pairs or dict or OrderedDict or Metadata, optional
168171
Metadata (title, description, author, creation_date, ...) associated with the array.
169172
Keys must be strings. Values must be of type string, int, float, date, time or datetime.
173+
cartesian_prod : bool, optional
174+
Whether or not to expand the dataframe to a cartesian product dataframe as needed by LArray.
175+
This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
176+
well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
177+
Defaults to True.
170178
171179
Returns
172180
-------
@@ -223,8 +231,14 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
223231
else:
224232
axes_names += [df.columns.name]
225233

226-
df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns,
227-
fill_value=fill_value, **kwargs)
234+
if cartesian_prod:
235+
df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns,
236+
fill_value=fill_value, **kwargs)
237+
else:
238+
if sort_rows or sort_columns:
239+
raise ValueError('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. '
240+
'Please call the method sort_axes on the returned array to sort rows or columns')
241+
axes_labels = index_to_labels(df.index, sort=False)
228242

229243
# Pandas treats column labels as column names (strings) so we need to convert them to values
230244
last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values)
@@ -237,7 +251,8 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
237251
return LArray(data, axes, meta=meta)
238252

239253

240-
def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, **kwargs):
254+
def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, cartesian_prod=True,
255+
**kwargs):
241256
"""
242257
Prepare Pandas DataFrame and then convert it into LArray.
243258
@@ -246,9 +261,12 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
246261
df : Pandas DataFrame
247262
Input dataframe.
248263
sort_rows : bool, optional
249-
Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
264+
Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
265+
Must be False if `cartesian_prod` is set to False.
266+
Defaults to False.
250267
sort_columns : bool, optional
251268
Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
269+
Must be False if `cartesian_prod` is set to False.
252270
Defaults to False.
253271
raw : bool, optional
254272
Whether or not to consider the input dataframe as a raw dataframe, i.e. read without index at all.
@@ -260,6 +278,11 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
260278
Whether or not to assume the array is stored in "wide" format.
261279
If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column.
262280
Defaults to True.
281+
cartesian_prod : bool, optional
282+
Whether or not to expand the dataframe to a cartesian product dataframe as needed by LArray.
283+
This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
284+
well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
285+
Defaults to True.
263286
264287
Returns
265288
-------
@@ -306,7 +329,7 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
306329
axes_names = [decode(name, 'utf8') for name in df.index.names]
307330
unfold_last_axis_name = isinstance(axes_names[-1], basestring) and '\\' in axes_names[-1]
308331
return from_frame(df, sort_rows=sort_rows, sort_columns=sort_columns, parse_header=parse_header,
309-
unfold_last_axis_name=unfold_last_axis_name, **kwargs)
332+
unfold_last_axis_name=unfold_last_axis_name, cartesian_prod=cartesian_prod, **kwargs)
310333

311334

312335
# #################################### #

0 commit comments

Comments
 (0)