fix #563 : improved speed of read_hdf() when stored object is an LArray object

alixdamman · alixdamman · commit 22fc174c8a74 · 2019-05-23T15:40:29.000+02:00
diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc
@@ -223,7 +223,6 @@ Miscellaneous improvements
 
 * implemented :py:obj:`LArray.reverse()` method to reverse one or several axes of an array (closes :issue:`631`).
 
-
 * added :py:obj:`set_options` allowing to set options for larray within a ``with`` block or globally:
 
     >>> from larray import *
@@ -272,6 +271,10 @@ Miscellaneous improvements
 
   Closes :issue:`274`.
 
+* improved speed of :py:obj:`read_hdf()` function when reading a stored LArray object dumped with
+  the current or a future version of larray. To benefit from the speedup when reading arrays dumped
+  with older versions of larray, please read and re-dump them. Closes :issue:`563`.
+
 
 Fixes
 -----
diff --git a/larray/core/array.py b/larray/core/array.py
@@ -6194,7 +6194,9 @@ def to_hdf(self, filepath, key):
         key = _translate_group_key_hdf(key)
         with LHDFStore(filepath) as store:
             store.put(key, self.to_frame())
-            store.get_storer(key).attrs.type = 'Array'
+            attrs = store.get_storer(key).attrs
+            attrs.type = 'Array'
+            attrs.writer = 'LArray'
             self.meta.to_hdf(store, key)
 
     @deprecate_kwarg('sheet_name', 'sheet')
diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py
@@ -31,9 +31,12 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
         Value used to fill cells corresponding to label combinations which are not present in the input.
         Defaults to NaN.
     sort_rows : bool, optional
-        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
+        Whether or not to sort the rows alphabetically.
+        Must be False if the read array has been dumped with an larray version >= 0.30.
+        Defaults to False.
     sort_columns : bool, optional
-        Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
+        Whether or not to sort the columns alphabetically.
+        Must be False if the read array has been dumped with an larray version >= 0.30.
         Defaults to False.
     name : str, optional
         Name of the axis or group to return. If None, name is set to passed key.
@@ -69,12 +72,15 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s
     with LHDFStore(filepath_or_buffer) as store:
         pd_obj = store.get(key)
         attrs = store.get_storer(key).attrs
+        writer = attrs.writer if 'writer' in attrs else None
         # for backward compatibility but any object read from an hdf file should have an attribute 'type'
         _type = attrs.type if 'type' in attrs else 'Array'
         _meta = attrs.metadata if 'metadata' in attrs else None
         if _type == 'Array':
+            # cartesian product is not necessary if the array was written by LArray
+            cartesian_prod = writer != 'LArray'
             res = df_aslarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
-                              parse_header=False)
+                              parse_header=False, cartesian_prod=cartesian_prod)
             if _meta is not None:
                 res.meta = _meta
         elif _type == 'Axis':
diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py
@@ -141,7 +141,7 @@ def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs):
 
 
 def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfold_last_axis_name=False,
-               fill_value=nan, meta=None, **kwargs):
+               fill_value=nan, meta=None, cartesian_prod=True, **kwargs):
     r"""
     Converts Pandas DataFrame into LArray.
 
@@ -151,9 +151,12 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
         Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
         columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
     sort_rows : bool, optional
-        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
+        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
+        Must be False if `cartesian_prod` is set to False.
+        Defaults to False.
     sort_columns : bool, optional
         Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
+        Must be False if `cartesian_prod` is set to False.
         Defaults to False.
     parse_header : bool, optional
         Whether or not to parse columns labels. Pandas treats column labels as strings.
@@ -167,6 +170,11 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
     meta : list of pairs or dict or OrderedDict or Metadata, optional
         Metadata (title, description, author, creation_date, ...) associated with the array.
         Keys must be strings. Values must be of type string, int, float, date, time or datetime.
+    cartesian_prod : bool, optional
+        Whether or not to expand the dataframe to a cartesian product dataframe as needed by LArray.
+        This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
+        well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
+        Defaults to True.
 
     Returns
     -------
@@ -223,8 +231,14 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
     else:
         axes_names += [df.columns.name]
 
-    df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns,
-                                           fill_value=fill_value, **kwargs)
+    if cartesian_prod:
+        df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns,
+                                               fill_value=fill_value, **kwargs)
+    else:
+        if sort_rows or sort_columns:
+            raise ValueError('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. '
+                             'Please call the method sort_axes on the returned array to sort rows or columns')
+        axes_labels = index_to_labels(df.index, sort=False)
 
     # Pandas treats column labels as column names (strings) so we need to convert them to values
     last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values)
@@ -237,7 +251,8 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo
     return LArray(data, axes, meta=meta)
 
 
-def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, **kwargs):
+def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, cartesian_prod=True,
+                **kwargs):
     """
     Prepare Pandas DataFrame and then convert it into LArray.
 
@@ -246,9 +261,12 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
     df : Pandas DataFrame
         Input dataframe.
     sort_rows : bool, optional
-        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
+        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
+        Must be False if `cartesian_prod` is set to False.
+        Defaults to False.
     sort_columns : bool, optional
         Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
+        Must be False if `cartesian_prod` is set to False.
         Defaults to False.
     raw : bool, optional
         Whether or not to consider the input dataframe as a raw dataframe, i.e. read without index at all.
@@ -260,6 +278,11 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
         Whether or not to assume the array is stored in "wide" format.
         If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column.
         Defaults to True.
+    cartesian_prod : bool, optional
+        Whether or not to expand the dataframe to a cartesian product dataframe as needed by LArray.
+        This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
+        well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
+        Defaults to True.
 
     Returns
     -------
@@ -306,7 +329,7 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header
         axes_names = [decode(name, 'utf8') for name in df.index.names]
         unfold_last_axis_name = isinstance(axes_names[-1], basestring) and '\\' in axes_names[-1]
         return from_frame(df, sort_rows=sort_rows, sort_columns=sort_columns, parse_header=parse_header,
-                          unfold_last_axis_name=unfold_last_axis_name, **kwargs)
+                          unfold_last_axis_name=unfold_last_axis_name, cartesian_prod=cartesian_prod, **kwargs)
 
 
 # #################################### #