Skip to content

Commit 4cfc8cd

Browse files
committed
ENH/DOC: added PerformanceWarning which will trigger on putting a non-endemic type
fixed legacy_0.10.h5 issues with docs (finding file)
1 parent 8b28862 commit 4cfc8cd

File tree

4 files changed

+127
-22
lines changed

4 files changed

+127
-22
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ pandas 0.10.1
8888
- refactored HDFStore to deal with non-table stores as objects, which will allow future enhancements
8989
- removed keyword ``compression`` from ``put`` (replaced by keyword
9090
``complib`` to be consistent across library)
91+
- warn `PerformanceWarning` if you are attempting to store types that will be pickled by PyTables
9192

9293
.. _GH512: https://github.com/pydata/pandas/issues/512
9394
.. _GH1277: https://github.com/pydata/pandas/issues/1277

doc/source/io.rst

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,7 +1211,7 @@ You can create/modify an index for a table with ``create_table_index`` after dat
12111211
12121212
Query via Data Columns
12131213
~~~~~~~~~~~~~~~~~~~~~~
1214-
You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query.
1214+
You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance, say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to be ``data_columns``.
12151215

12161216
.. ipython:: python
12171217
@@ -1377,21 +1377,26 @@ External Compatibility
13771377
.. ipython:: python
13781378
:suppress:
13791379
1380-
legacy_store.close()
1380+
store_export.close()
13811381
import os
1382-
os.remove('store_export.h5')
1382+
os.remove('export.h5')
13831383
13841384
Backwards Compatibility
13851385
~~~~~~~~~~~~~~~~~~~~~~~
13861386

13871387
0.10.1 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas; however, query terms using the prior (undocumented) methodology are unsupported. ``HDFStore`` will issue a warning if you try to use a prior-version format file. You must read in the entire file and write it out using the new format, using the method ``copy`` to take advantage of the updates. The group attribute ``pandas_version`` contains the version information. ``copy`` takes a number of options; please see the docstring.
13881388

13891389

1390+
.. ipython:: python
1391+
:suppress:
1392+
1393+
import os
1394+
legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5')
1395+
13901396
.. ipython:: python
13911397
13921398
# a legacy store
1393-
import os
1394-
legacy_store = HDFStore('legacy_0.10.h5', 'r')
1399+
legacy_store = HDFStore(legacy_file_path,'r')
13951400
legacy_store
13961401
13971402
# copy (and return the new handle)
@@ -1415,6 +1420,7 @@ Performance
14151420
- You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will significantly lower your memory usage on writing.
14161421
- You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expected rows that ``PyTables`` will expect. This will optimize read/write performance.
14171422
- Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs)
1423+
- A ``PerformanceWarning`` will be issued if you are attempting to store types that will be pickled by PyTables (rather than stored as native types). See <http://stackoverflow.com/questions/14355151/how-to-make-pandas-hdfstore-put-operation-faster/14370190#14370190> for more information and some solutions.
14181424

14191425
Experimental
14201426
~~~~~~~~~~~~

pandas/io/pytables.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ class IncompatibilityWarning(Warning): pass
4242
where criteria is being ignored as this version [%s] is too old (or not-defined),
4343
read the file in and write it out to a new file to upgrade (with the copy_to method)
4444
"""
45+
class PerformanceWarning(Warning): pass
46+
performance_doc = """
47+
your performance may suffer as PyTables will pickle object types that it cannot map
48+
directly to c-types [inferred_type->%s,key->%s]
49+
"""
4550

4651
# map object types
4752
_TYPE_MAP = {
@@ -510,7 +515,7 @@ def append(self, key, value, columns=None, **kwargs):
510515
511516
Optional Parameters
512517
-------------------
513-
data_columns : list of columns to create as data columns
518+
data_columns : list of columns to create as data columns, or True to use all columns
514519
min_itemsize : dict of columns that specify minimum string sizes
515520
nan_rep : string to use as string nan representation
516521
chunksize : size to chunk the writing
@@ -1606,6 +1611,17 @@ def write_array(self, key, value):
16061611
return
16071612

16081613
if value.dtype.type == np.object_:
1614+
1615+
# infer the type, warn if we have a non-string type here (for performance)
1616+
inferred_type = lib.infer_dtype(value.flatten())
1617+
if empty_array:
1618+
pass
1619+
elif inferred_type == 'string':
1620+
pass
1621+
else:
1622+
ws = performance_doc % (inferred_type,key)
1623+
warnings.warn(ws, PerformanceWarning)
1624+
16091625
vlarr = self.handle.createVLArray(self.group, key,
16101626
_tables().ObjectAtom())
16111627
vlarr.append(value)
@@ -1846,7 +1862,7 @@ class Table(Storer):
18461862
index_axes : a list of tuples of the (original indexing axis and index column)
18471863
non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis)
18481864
values_axes : a list of the columns which comprise the data of this table
1849-
data_columns : a list of the columns that we are allowing indexing (these become single columns in values_axes)
1865+
data_columns : a list of the columns that we are allowing indexing (these become single columns in values_axes), or True to force all columns
18501866
nan_rep : the string to use for nan representations for string objects
18511867
levels : the names of levels
18521868
@@ -2111,7 +2127,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
21112127
validate: validate the obj against an existing object already written
21122128
min_itemsize: a dict of the min size for a column in bytes
21132129
nan_rep : a value to use for string column nan_rep
2114-
data_columns : a list of columns that we want to create separate to allow indexing
2130+
data_columns : a list of columns that we want to create separate to allow indexing (or True will force all columns)
21152131
21162132
"""
21172133

@@ -2196,6 +2212,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
21962212
if data_columns is not None and len(self.non_index_axes):
21972213
axis = self.non_index_axes[0][0]
21982214
axis_labels = self.non_index_axes[0][1]
2215+
if data_columns is True:
2216+
data_columns = axis_labels
2217+
21992218
data_columns = [c for c in data_columns if c in axis_labels]
22002219
if len(data_columns):
22012220
blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
@@ -2238,7 +2257,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
22382257
except (NotImplementedError):
22392258
raise
22402259
except (Exception), detail:
2241-
raise Exception("cannot find the correct atom type -> [dtype->%s] %s" % (b.dtype.name, str(detail)))
2260+
raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail)))
22422261
j += 1
22432262

22442263
# validate the axes if we have an existing table
@@ -2722,6 +2741,8 @@ def table_type_short(self):
27222741
def write(self, obj, data_columns=None, **kwargs):
27232742
if data_columns is None:
27242743
data_columns = []
2744+
elif data_columns is True:
2745+
data_columns = obj.columns[:]
27252746
for n in obj.index.names:
27262747
if n not in data_columns:
27272748
data_columns.insert(0, n)

pandas/io/tests/test_pytables.py

Lines changed: 90 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
1111
date_range, Index)
12-
from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning
12+
from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning, PerformanceWarning
1313
import pandas.util.testing as tm
1414
from pandas.tests.test_series import assert_series_equal
1515
from pandas.tests.test_frame import assert_frame_equal
@@ -260,6 +260,28 @@ def test_put_integer(self):
260260
df = DataFrame(np.random.randn(50, 100))
261261
self._check_roundtrip(df, tm.assert_frame_equal)
262262

263+
def test_put_mixed_type(self):
264+
df = tm.makeTimeDataFrame()
265+
df['obj1'] = 'foo'
266+
df['obj2'] = 'bar'
267+
df['bool1'] = df['A'] > 0
268+
df['bool2'] = df['B'] > 0
269+
df['bool3'] = True
270+
df['int1'] = 1
271+
df['int2'] = 2
272+
df['timestamp1'] = Timestamp('20010102')
273+
df['timestamp2'] = Timestamp('20010103')
274+
df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
275+
df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
276+
df.ix[3:6, ['obj1']] = np.nan
277+
df = df.consolidate().convert_objects()
278+
self.store.remove('df')
279+
warnings.filterwarnings('ignore', category=PerformanceWarning)
280+
self.store.put('df',df)
281+
expected = self.store.get('df')
282+
tm.assert_frame_equal(expected,df)
283+
warnings.filterwarnings('always', category=PerformanceWarning)
284+
263285
def test_append(self):
264286

265287
df = tm.makeTimeDataFrame()
@@ -703,18 +725,20 @@ def test_big_table_frame(self):
703725
print "\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)
704726

705727
def test_big_table2_frame(self):
706-
# this is a really big table: 2.5m rows x 300 float columns, 20 string
728+
# this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime
707729
# columns
708730
raise nose.SkipTest('no big table2 frame')
709731

710732
# create and write a big table
711733
print "\nbig_table2 start"
712734
import time
713735
start_time = time.time()
714-
df = DataFrame(np.random.randn(2.5 * 1000 * 1000, 300), index=range(int(
715-
2.5 * 1000 * 1000)), columns=['E%03d' % i for i in xrange(300)])
716-
for x in range(20):
736+
df = DataFrame(np.random.randn(1000 * 1000, 60), index=xrange(int(
737+
1000 * 1000)), columns=['E%03d' % i for i in xrange(60)])
738+
for x in xrange(20):
717739
df['String%03d' % x] = 'string%03d' % x
740+
for x in xrange(20):
741+
df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0)
718742

719743
print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)
720744
fn = 'big_table2.h5'
@@ -728,7 +752,7 @@ def f(chunksize):
728752
store.close()
729753
return r
730754

731-
for c in [10000, 50000, 100000, 250000]:
755+
for c in [10000, 50000, 250000]:
732756
start_time = time.time()
733757
print "big_table2 frame [chunk->%s]" % c
734758
rows = f(c)
@@ -737,6 +761,35 @@ def f(chunksize):
737761
finally:
738762
os.remove(fn)
739763

764+
def test_big_put_frame(self):
765+
raise nose.SkipTest('no big put frame')
766+
767+
print "\nbig_put start"
768+
import time
769+
start_time = time.time()
770+
df = DataFrame(np.random.randn(1000 * 1000, 60), index=xrange(int(
771+
1000 * 1000)), columns=['E%03d' % i for i in xrange(60)])
772+
for x in xrange(20):
773+
df['String%03d' % x] = 'string%03d' % x
774+
for x in xrange(20):
775+
df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0)
776+
777+
print "\nbig_put frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)
778+
fn = 'big_put.h5'
779+
780+
try:
781+
782+
start_time = time.time()
783+
store = HDFStore(fn, mode='w')
784+
store.put('df', df)
785+
store.close()
786+
787+
print df.get_dtype_counts()
788+
print "big_put frame [shape->%s] -> %5.2f" % (df.shape, time.time() - start_time)
789+
790+
finally:
791+
os.remove(fn)
792+
740793
def test_big_table_panel(self):
741794
raise nose.SkipTest('no big table panel')
742795

@@ -823,15 +876,23 @@ def test_table_index_incompatible_dtypes(self):
823876

824877
def test_table_values_dtypes_roundtrip(self):
825878
df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
826-
self.store.append('df1', df1)
827-
assert df1.dtypes == self.store['df1'].dtypes
879+
self.store.append('df_f8', df1)
880+
assert df1.dtypes == self.store['df_f8'].dtypes
828881

829882
df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
830-
self.store.append('df2', df2)
831-
assert df2.dtypes == self.store['df2'].dtypes
883+
self.store.append('df_i8', df2)
884+
assert df2.dtypes == self.store['df_i8'].dtypes
832885

833886
# incompatible dtype
834-
self.assertRaises(Exception, self.store.append, 'df2', df1)
887+
self.assertRaises(Exception, self.store.append, 'df_i8', df1)
888+
889+
#df1 = DataFrame({'a': Series([1, 2, 3], dtype='f4')})
890+
#self.store.append('df_f4', df1)
891+
#assert df1.dtypes == self.store['df_f4'].dtypes
892+
893+
#df2 = DataFrame({'a': Series([1, 2, 3], dtype='i4')})
894+
#self.store.append('df_i4', df2)
895+
#assert df2.dtypes == self.store['df_i4'].dtypes
835896

836897
def test_table_mixed_dtypes(self):
837898

@@ -1165,27 +1226,35 @@ def test_tuple_index(self):
11651226
idx = [(0., 1.), (2., 3.), (4., 5.)]
11661227
data = np.random.randn(30).reshape((3, 10))
11671228
DF = DataFrame(data, index=idx, columns=col)
1229+
warnings.filterwarnings('ignore', category=PerformanceWarning)
11681230
self._check_roundtrip(DF, tm.assert_frame_equal)
1231+
warnings.filterwarnings('always', category=PerformanceWarning)
11691232

11701233
def test_index_types(self):
11711234
values = np.random.randn(2)
11721235

11731236
func = lambda l, r: tm.assert_series_equal(l, r, True, True, True)
11741237

1238+
warnings.filterwarnings('ignore', category=PerformanceWarning)
11751239
ser = Series(values, [0, 'y'])
11761240
self._check_roundtrip(ser, func)
1241+
warnings.filterwarnings('always', category=PerformanceWarning)
11771242

11781243
ser = Series(values, [datetime.datetime.today(), 0])
11791244
self._check_roundtrip(ser, func)
11801245

11811246
ser = Series(values, ['y', 0])
11821247
self._check_roundtrip(ser, func)
11831248

1249+
warnings.filterwarnings('ignore', category=PerformanceWarning)
11841250
ser = Series(values, [datetime.date.today(), 'a'])
11851251
self._check_roundtrip(ser, func)
1252+
warnings.filterwarnings('always', category=PerformanceWarning)
11861253

1254+
warnings.filterwarnings('ignore', category=PerformanceWarning)
11871255
ser = Series(values, [1.23, 'b'])
11881256
self._check_roundtrip(ser, func)
1257+
warnings.filterwarnings('always', category=PerformanceWarning)
11891258

11901259
ser = Series(values, [1, 1.53])
11911260
self._check_roundtrip(ser, func)
@@ -1456,6 +1525,13 @@ def test_select(self):
14561525
expected = df[df.A > 0].reindex(columns=['A', 'B'])
14571526
tm.assert_frame_equal(expected, result)
14581527

1528+
# all columns as data columns
1529+
self.store.remove('df')
1530+
self.store.append('df', df, data_columns=True)
1531+
result = self.store.select('df', ['A > 0'], columns=['A', 'B'])
1532+
expected = df[df.A > 0].reindex(columns=['A', 'B'])
1533+
tm.assert_frame_equal(expected, result)
1534+
14591535
# with a data column, but different columns
14601536
self.store.remove('df')
14611537
self.store.append('df', df, data_columns=['A'])
@@ -1776,7 +1852,6 @@ def test_legacy_table_read(self):
17761852
store.select('df2', typ='legacy_frame')
17771853

17781854
# old version warning
1779-
import warnings
17801855
warnings.filterwarnings('ignore', category=IncompatibilityWarning)
17811856
self.assertRaises(
17821857
Exception, store.select, 'wp1', Term('minor_axis', '=', 'B'))
@@ -1915,9 +1990,11 @@ def test_tseries_indices_frame(self):
19151990

19161991
def test_unicode_index(self):
19171992
unicode_values = [u'\u03c3', u'\u03c3\u03c3']
1918-
1993+
warnings.filterwarnings('ignore', category=PerformanceWarning)
19191994
s = Series(np.random.randn(len(unicode_values)), unicode_values)
19201995
self._check_roundtrip(s, tm.assert_series_equal)
1996+
warnings.filterwarnings('always', category=PerformanceWarning)
1997+
19211998

19221999
def test_store_datetime_mixed(self):
19232000
df = DataFrame(

0 commit comments

Comments
 (0)