ENH/DOC: added PerformanceWarning which will trigger on putting a non-native (pickled) type

jreback · jreback · commit 4cfc8cd2f9e5 · 2013-01-18T10:41:03.000-05:00
fixed legacy_0.10.h5 issues with docs (finding file)
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -88,6 +88,7 @@ pandas 0.10.1
     - refactored HFDStore to deal with non-table stores as objects, will allow future enhancements
     - removed keyword ``compression`` from ``put`` (replaced by keyword
       ``complib`` to be consistent across library)
+    - warn with a ``PerformanceWarning`` if you are attempting to store types that will be pickled by PyTables
 
 .. _GH512: https://github.com/pydata/pandas/issues/512
 .. _GH1277: https://github.com/pydata/pandas/issues/1277
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1211,7 +1211,7 @@ You can create/modify an index for a table with ``create_table_index`` after dat
 
 Query via Data Columns
 ~~~~~~~~~~~~~~~~~~~~~~
-You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query.
+You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to be ``data_columns``.
 
 .. ipython:: python
 
@@ -1377,21 +1377,26 @@ External Compatibility
      .. ipython:: python
         :suppress:
  
-        legacy_store.close()
+        store_export.close()
         import os
-        os.remove('store_export.h5')
+        os.remove('export.h5')
 
 Backwards Compatibility
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 0.10.1 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas however, query terms using the prior (undocumented) methodology are unsupported. ``HDFStore`` will issue a warning if you try to use a prior-version format file. You must read in the entire file and write it out using the new format, using the method ``copy`` to take advantage of the updates. The group attribute ``pandas_version`` contains the version information. ``copy`` takes a number of options, please see the docstring.
 
 
+     .. ipython:: python
+        :suppress:
+ 
+        import os
+        legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5')
+
      .. ipython:: python
 
         # a legacy store
-	import os
-        legacy_store = HDFStore('legacy_0.10.h5', 'r')
+        legacy_store = HDFStore(legacy_file_path,'r')
         legacy_store
 
         # copy (and return the new handle)
@@ -1415,6 +1420,7 @@ Performance
    - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing.
    - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance.
    - Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs)
+   - A ``PerformanceWarning`` will be issued if you are attempting to store types that will be pickled by PyTables (rather than stored as native types). See <http://stackoverflow.com/questions/14355151/how-to-make-pandas-hdfstore-put-operation-faster/14370190#14370190> for more information and some solutions.
 
 Experimental
 ~~~~~~~~~~~~
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -42,6 +42,11 @@ class IncompatibilityWarning(Warning): pass
 where criteria is being ignored as this version [%s] is too old (or not-defined),
 read the file in and write it out to a new file to upgrade (with the copy_to method)
 """
+class PerformanceWarning(Warning): pass
+performance_doc = """
+your performance may suffer as PyTables will pickle object types that it cannot map
+directly to c-types [inferred_type->%s,key->%s]
+"""
 
 # map object types
 _TYPE_MAP = {
@@ -510,7 +515,7 @@ def append(self, key, value, columns=None, **kwargs):
 
         Optional Parameters
         -------------------
-        data_columns : list of columns to create as data columns
+        data_columns : list of columns to create as data columns, or True to use all columns
         min_itemsize : dict of columns that specify minimum string sizes
         nan_rep      : string to use as string nan represenation
         chunksize    : size to chunk the writing
@@ -1606,6 +1611,17 @@ def write_array(self, key, value):
                 return
 
         if value.dtype.type == np.object_:
+
+            # infer the type, warn if we have a non-string type here (for performance)
+            inferred_type = lib.infer_dtype(value.flatten())
+            if empty_array:
+                pass
+            elif inferred_type == 'string':
+                pass
+            else:
+                ws = performance_doc % (inferred_type,key)
+                warnings.warn(ws, PerformanceWarning)
+
             vlarr = self.handle.createVLArray(self.group, key,
                                               _tables().ObjectAtom())
             vlarr.append(value)
@@ -1846,7 +1862,7 @@ class Table(Storer):
         index_axes    : a list of tuples of the (original indexing axis and index column)
         non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis)
         values_axes   : a list of the columns which comprise the data of this table
-        data_columns  : a list of the columns that we are allowing indexing (these become single columns in values_axes)
+        data_columns  : a list of the columns that we are allowing indexing (these become single columns in values_axes), or True to force all columns
         nan_rep       : the string to use for nan representations for string objects
         levels        : the names of levels
 
@@ -2111,7 +2127,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
             validate: validate the obj against an existiing object already written
             min_itemsize: a dict of the min size for a column in bytes
             nan_rep : a values to use for string column nan_rep
-            data_columns : a list of columns that we want to create separate to allow indexing
+            data_columns : a list of columns that we want to create separately to allow indexing (or True will force all columns)
 
         """
 
@@ -2196,6 +2212,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
         if data_columns is not None and len(self.non_index_axes):
             axis = self.non_index_axes[0][0]
             axis_labels = self.non_index_axes[0][1]
+            if data_columns is True:
+                data_columns = axis_labels
+
             data_columns = [c for c in data_columns if c in axis_labels]
             if len(data_columns):
                 blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
@@ -2238,7 +2257,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
             except (NotImplementedError):
                 raise
             except (Exception), detail:
-                raise Exception("cannot find the correct atom type -> [dtype->%s] %s" % (b.dtype.name, str(detail)))
+                raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail)))
             j += 1
 
         # validate the axes if we have an existing table
@@ -2722,6 +2741,8 @@ def table_type_short(self):
     def write(self, obj, data_columns=None, **kwargs):
         if data_columns is None:
             data_columns = []
+        elif data_columns is True:
+            data_columns = obj.columns[:]
         for n in obj.index.names:
             if n not in data_columns:
                 data_columns.insert(0, n)
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -9,7 +9,7 @@
 
 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
                     date_range, Index)
-from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning
+from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning, PerformanceWarning
 import pandas.util.testing as tm
 from pandas.tests.test_series import assert_series_equal
 from pandas.tests.test_frame import assert_frame_equal
@@ -260,6 +260,28 @@ def test_put_integer(self):
         df = DataFrame(np.random.randn(50, 100))
         self._check_roundtrip(df, tm.assert_frame_equal)
 
+    def test_put_mixed_type(self):
+        df = tm.makeTimeDataFrame()
+        df['obj1'] = 'foo'
+        df['obj2'] = 'bar'
+        df['bool1'] = df['A'] > 0
+        df['bool2'] = df['B'] > 0
+        df['bool3'] = True
+        df['int1'] = 1
+        df['int2'] = 2
+        df['timestamp1'] = Timestamp('20010102')
+        df['timestamp2'] = Timestamp('20010103')
+        df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
+        df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
+        df.ix[3:6, ['obj1']] = np.nan
+        df = df.consolidate().convert_objects()
+        self.store.remove('df')
+        warnings.filterwarnings('ignore', category=PerformanceWarning)
+        self.store.put('df',df)
+        expected = self.store.get('df')
+        tm.assert_frame_equal(expected,df)
+        warnings.filterwarnings('always', category=PerformanceWarning)
+
     def test_append(self):
 
         df = tm.makeTimeDataFrame()
@@ -703,18 +725,20 @@ def test_big_table_frame(self):
         print "\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)
 
     def test_big_table2_frame(self):
-        # this is a really big table: 2.5m rows x 300 float columns, 20 string
+        # this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime
         # columns
         raise nose.SkipTest('no big table2 frame')
 
         # create and write a big table
         print "\nbig_table2 start"
         import time
         start_time = time.time()
-        df = DataFrame(np.random.randn(2.5 * 1000 * 1000, 300), index=range(int(
-            2.5 * 1000 * 1000)), columns=['E%03d' % i for i in xrange(300)])
-        for x in range(20):
+        df = DataFrame(np.random.randn(1000 * 1000, 60), index=xrange(int(
+            1000 * 1000)), columns=['E%03d' % i for i in xrange(60)])
+        for x in xrange(20):
             df['String%03d' % x] = 'string%03d' % x
+        for x in xrange(20):
+            df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0)
 
         print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)
         fn = 'big_table2.h5'
@@ -728,7 +752,7 @@ def f(chunksize):
                 store.close()
                 return r
 
-            for c in [10000, 50000, 100000, 250000]:
+            for c in [10000, 50000, 250000]:
                 start_time = time.time()
                 print "big_table2 frame [chunk->%s]" % c
                 rows = f(c)
@@ -737,6 +761,35 @@ def f(chunksize):
         finally:
             os.remove(fn)
 
+    def test_big_put_frame(self):
+        raise nose.SkipTest('no big put frame')
+
+        print "\nbig_put start"
+        import time
+        start_time = time.time()
+        df = DataFrame(np.random.randn(1000 * 1000, 60), index=xrange(int(
+            1000 * 1000)), columns=['E%03d' % i for i in xrange(60)])
+        for x in xrange(20):
+            df['String%03d' % x] = 'string%03d' % x
+        for x in xrange(20):
+            df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0)
+
+        print "\nbig_put frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)
+        fn = 'big_put.h5'
+
+        try:
+
+            start_time = time.time()
+            store = HDFStore(fn, mode='w')
+            store.put('df', df)
+            store.close()
+
+            print df.get_dtype_counts()
+            print "big_put frame [shape->%s] -> %5.2f" % (df.shape, time.time() - start_time)
+
+        finally:
+            os.remove(fn)
+
     def test_big_table_panel(self):
         raise nose.SkipTest('no big table panel')
 
@@ -823,15 +876,23 @@ def test_table_index_incompatible_dtypes(self):
 
     def test_table_values_dtypes_roundtrip(self):
         df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
-        self.store.append('df1', df1)
-        assert df1.dtypes == self.store['df1'].dtypes
+        self.store.append('df_f8', df1)
+        assert df1.dtypes == self.store['df_f8'].dtypes
 
         df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
-        self.store.append('df2', df2)
-        assert df2.dtypes == self.store['df2'].dtypes
+        self.store.append('df_i8', df2)
+        assert df2.dtypes == self.store['df_i8'].dtypes
 
         # incompatible dtype
-        self.assertRaises(Exception, self.store.append, 'df2', df1)
+        self.assertRaises(Exception, self.store.append, 'df_i8', df1)
+
+        #df1 = DataFrame({'a': Series([1, 2, 3], dtype='f4')})
+        #self.store.append('df_f4', df1)
+        #assert df1.dtypes == self.store['df_f4'].dtypes
+
+        #df2 = DataFrame({'a': Series([1, 2, 3], dtype='i4')})
+        #self.store.append('df_i4', df2)
+        #assert df2.dtypes == self.store['df_i4'].dtypes
 
     def test_table_mixed_dtypes(self):
 
@@ -1165,27 +1226,35 @@ def test_tuple_index(self):
         idx = [(0., 1.), (2., 3.), (4., 5.)]
         data = np.random.randn(30).reshape((3, 10))
         DF = DataFrame(data, index=idx, columns=col)
+        warnings.filterwarnings('ignore', category=PerformanceWarning)
         self._check_roundtrip(DF, tm.assert_frame_equal)
+        warnings.filterwarnings('always', category=PerformanceWarning)
 
     def test_index_types(self):
         values = np.random.randn(2)
 
         func = lambda l, r: tm.assert_series_equal(l, r, True, True, True)
 
+        warnings.filterwarnings('ignore', category=PerformanceWarning)
         ser = Series(values, [0, 'y'])
         self._check_roundtrip(ser, func)
+        warnings.filterwarnings('always', category=PerformanceWarning)
 
         ser = Series(values, [datetime.datetime.today(), 0])
         self._check_roundtrip(ser, func)
 
         ser = Series(values, ['y', 0])
         self._check_roundtrip(ser, func)
 
+        warnings.filterwarnings('ignore', category=PerformanceWarning)
         ser = Series(values, [datetime.date.today(), 'a'])
         self._check_roundtrip(ser, func)
+        warnings.filterwarnings('always', category=PerformanceWarning)
 
+        warnings.filterwarnings('ignore', category=PerformanceWarning)
         ser = Series(values, [1.23, 'b'])
         self._check_roundtrip(ser, func)
+        warnings.filterwarnings('always', category=PerformanceWarning)
 
         ser = Series(values, [1, 1.53])
         self._check_roundtrip(ser, func)
@@ -1456,6 +1525,13 @@ def test_select(self):
         expected = df[df.A > 0].reindex(columns=['A', 'B'])
         tm.assert_frame_equal(expected, result)
 
+        # all columns as data columns
+        self.store.remove('df')
+        self.store.append('df', df, data_columns=True)
+        result = self.store.select('df', ['A > 0'], columns=['A', 'B'])
+        expected = df[df.A > 0].reindex(columns=['A', 'B'])
+        tm.assert_frame_equal(expected, result)
+
         # with a data column, but different columns
         self.store.remove('df')
         self.store.append('df', df, data_columns=['A'])
@@ -1776,7 +1852,6 @@ def test_legacy_table_read(self):
         store.select('df2', typ='legacy_frame')
 
         # old version warning
-        import warnings
         warnings.filterwarnings('ignore', category=IncompatibilityWarning)
         self.assertRaises(
             Exception, store.select, 'wp1', Term('minor_axis', '=', 'B'))
@@ -1915,9 +1990,11 @@ def test_tseries_indices_frame(self):
 
     def test_unicode_index(self):
         unicode_values = [u'\u03c3', u'\u03c3\u03c3']
-
+        warnings.filterwarnings('ignore', category=PerformanceWarning)
         s = Series(np.random.randn(len(unicode_values)), unicode_values)
         self._check_roundtrip(s, tm.assert_series_equal)
+        warnings.filterwarnings('always', category=PerformanceWarning)
+
 
     def test_store_datetime_mixed(self):
         df = DataFrame(