diff --git a/RELEASE.rst b/RELEASE.rst index 63accf42c470d..d2b9952829619 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -22,6 +22,43 @@ Where to get it * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +pandas 0.10.1 +============= + +**Release date:** 2013-??-?? + +**New features** + +**Improvements to existing features** + + - ``HDFStore`` + - enables storing of multi-index dataframes (closes GH1277_) + - support data column indexing and selection, via ``data_columns`` keyword in append + - support write chunking to reduce memory footprint, via ``chunksize`` keyword to append + - support automagic indexing via ``index`` keywork to append + - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize + - support ``start`` and ``stop`` keywords in select to limit the row selection space + - added ``get_store`` context manager to automatically import with pandas + - added column filtering via ``columns`` keyword in select + - added methods append_to_multiple/select_as_multiple/select_as_coordinates to do multiple-table append/selection + - added support for datetime64 in columns + - added method ``unique`` to select the unique values in an indexable or data column + +**Bug fixes** + + - ``HDFStore`` + - correctly handle ``nan`` elements in string columns; serialize via the ``nan_rep`` keyword to append + - raise correctly on non-implemented column types (unicode/date) + - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), (closes GH512_) + +**API Changes** + + - ``HDFStore`` + - removed keyword ``compression`` from ``put`` (replaced by keyword ``complib`` to be consistent across library) + +.. _GH512: https://github.com/pydata/pandas/issues/512 +.. _GH1277: https://github.com/pydata/pandas/issues/1277 + pandas 0.10.0 ============= diff --git a/doc/source/io.rst b/doc/source/io.rst index c73240725887f..bf9c913909dee 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1030,6 +1030,17 @@ Deletion of the object specified by the key del store['wp'] store +Closing a Store + +.. ipython:: python + + + # closing a store + store.close() + + # Working with, and automatically closing the store with the context manager. + with get_store('store.h5') as store: + store.keys() .. ipython:: python :suppress: @@ -1095,7 +1106,7 @@ Storing Mixed Types in a Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Storing mixed-dtype data is supported. Strings are store as a fixed-width using the maximum size of the appended column. Subsequent appends will truncate strings at this length. -Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools`` are currently supported. +Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools, datetime64`` are currently supported. For string columns, passing ``nan_rep = 'my_nan_rep'`` to append will change the default nan representation on disk (which converts to/from `np.nan`), this defaults to `nan`. .. 
ipython:: python @@ -1103,6 +1114,11 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set df_mixed['string'] = 'string' df_mixed['int'] = 1 df_mixed['bool'] = True + df_mixed['datetime64'] = Timestamp('20010102') + + # make sure that we have datetime64[ns] types + df_mixed = df_mixed.convert_objects() + df_mixed.ix[3:5,['A','B','string','datetime64']] = np.nan store.append('df_mixed', df_mixed, min_itemsize = { 'values' : 50 }) df_mixed1 = store.select('df_mixed') @@ -1112,10 +1128,33 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set # we have provided a minimum string column size store.root.df_mixed.table +It is ok to store ``np.nan`` in a ``float or string``. Make sure to do a ``convert_objects()`` on the frame before storing a ``np.nan`` in a datetime64 column. Storing a column with a ``np.nan`` in a ``int, bool`` will currently throw an ``Exception`` as these columns will have converted to ``object`` type. + +Storing Multi-Index DataFrames +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Storing multi-index dataframes as tables is very similar to storing/selecting from homogenous index DataFrames. + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df_mi = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df_mi + + store.append('df_mi',df_mi) + store.select('df_mi') + + # the levels are automatically included as data columns + store.select('df_mi', Term('foo=bar')) + Querying a Table ~~~~~~~~~~~~~~~~ - ``select`` and ``delete`` operations have an optional criteria that can be specified to select/delete only a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. @@ -1128,7 +1167,7 @@ Valid terms can be created from ``dict, list, tuple, or string``. Objects can be - ``dict(field = 'index', op = '>', value = '20121114')`` - ``('index', '>', '20121114')`` - - ``'index>20121114'`` + - ``'index > 20121114'`` - ``('index', '>', datetime(2012,11,14))`` - ``('index', ['20121114','20121115'])`` - ``('major_axis', '=', Timestamp('2012/11/14'))`` @@ -1143,14 +1182,30 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter store store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) +The ``columns`` keyword can be supplied to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` + +.. ipython:: python + + store.select('df', columns = ['A','B']) + +Start and Stop parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. + +.. ipython:: python + + # this is effectively what the storage of a Panel looks like + wp.to_frame() + + # limiting the search + store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ], start=0, stop=10) + + Indexing ~~~~~~~~ -You can create an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. 
It is not automagically done now because you may want to index different axes than the default (except in the case of a DataFrame, where it almost always makes sense to index the ``index``. +You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after an ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. **Indexes are automagically created (starting 0.10.1)** on the indexables and any data columns you specify. This behavior can be turned off by passing ``index=False`` to ``append``. .. ipython:: python - # create an index - store.create_table_index('df') + # we have automagically already created an index (in the first section) i = store.root.df.table.cols.index.index i.optlevel, i.kind @@ -1160,6 +1215,90 @@ You can create an index for a table with ``create_table_index`` after data is al i.optlevel, i.kind +Query via Data Columns +~~~~~~~~~~~~~~~~~~~~~~ +You can designate (and index) certain columns on which you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance, say you want to perform this common operation on-disk, and return just the frame that matches this query. + +.. ipython:: python + + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6,'string'] = np.nan + df_dc.ix[7:9,'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc + + # on-disk operations + store.append('df_dc', df_dc, data_columns = ['B','C','string','string2']) + store.select('df_dc',[ Term('B>0') ]) + + # getting creative + store.select('df_dc',[ 'B > 0', 'C > 0', 'string == foo' ]) + + # this is the in-memory version of this type of selection + df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + + # we have automagically created this index, and the B/C/string/string2 columns are stored separately as ``PyTables`` columns + store.root.df_dc.table + +There is some performance degradation from making lots of columns into `data columns`, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (of course you can simply read in the data and create a new table!). + +Advanced Queries +~~~~~~~~~~~~~~~~ + +**Unique** + +To retrieve the *unique* values of an indexable or data column, use the method ``unique``. This will, for example, enable you to get the index very quickly. Note that ``nan`` values are excluded from the result set. + +.. ipython:: python + + store.unique('df_dc','index') + store.unique('df_dc','string') + +**Replicating or** + +``not`` and ``or`` conditions are unsupported at this time; however, ``or`` operations are easy to replicate by repeatedly applying the criteria to the table, and then using ``concat`` to combine the results. + +.. ipython:: python + + crit1 = [ Term('B>0'), Term('C>0'), Term('string=foo') ] + crit2 = [ Term('B<0'), Term('C>0'), Term('string=foo') ] + + concat([ store.select('df_dc',c) for c in [ crit1, crit2 ] ]) + +**Table Object** + +If you want to inspect the table object, retrieve it via ``get_table``. You could use this programmatically to, say, get the number of rows in the table. + +.. ipython:: python + + store.get_table('df_dc').nrows + +Multiple Table Queries +~~~~~~~~~~~~~~~~~~~~~~ + +New in 0.10.1 are the methods ``append_to_multiple`` and ``select_as_multiple``, which can perform appending/selecting from multiple tables at once.
The idea is to have one table (call it the selector table) in which you index most/all of the columns, and perform your queries. The other table(s) are data tables that are indexed the same as the selector table. You can then perform a very fast query on the selector table, yet get lots of data back. This method works similarly to having a very wide table, but is more efficient in terms of queries. + +Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means: append to the tables in the same order; ``append_to_multiple`` splits a single object into multiple tables, given a specification (as a dictionary). This dictionary is a mapping of the table names to the 'columns' you want included in that table. Pass `None` for a single table (optional) to let it have the remaining columns. The argument ``selector`` defines which table is the selector table. + +.. ipython:: python + + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' + + # you can also create the tables individually + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store + + # individual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') + + Delete from a Table ~~~~~~~~~~~~~~~~~~~ You can delete from a table selectively by specifying a ``where``. In deleting rows, it is important to understand the ``PyTables`` deletes rows by erasing the rows, then **moving** the following data. Thus deleting can potentially be a very expensive operation depending on the orientation of your data. This is especially true in higher dimensional objects (``Panel`` and ``Panel4D``). To get optimal deletion speed, it pays to have the dimension you are deleting be the first of the ``indexables``. @@ -1184,6 +1323,33 @@ It should be clear that a delete operation on the ``major_axis`` will be fairly store.remove('wp', 'major_axis>20000102' ) store.select('wp') +Please note that HDF5 **DOES NOT RECLAIM SPACE** in the h5 files automatically. Thus, repeatedly deleting (or removing nodes) and adding again **WILL TEND TO INCREASE THE FILE SIZE**. To *clean* the file, use ``ptrepack`` (see below). + +Compression +~~~~~~~~~~~ +``PyTables`` allows the stored data to be compressed. This applies to all kinds of stores, not just tables. + + - Pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, which is the default) + - Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. + +``HDFStore`` will use the file based compression scheme if no overriding ``complib`` or ``complevel`` options are provided. ``blosc`` offers very fast compression, and is the one I use most often. Note that ``lzo`` and ``bzip2`` may not be installed (by Python) by default. + +Compression for all objects within the file + + - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')`` + +Or on-the-fly compression (this only applies to tables). You can turn off file compression for a specific table by passing ``complevel=0`` + + - ``store.append('df', df, complib='zlib', complevel=5)`` + +**ptrepack** + +``PyTables`` offers better write performance when tables are compressed after they are written, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``.
In addition, ``ptrepack`` can change compression levels after the fact. + + - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` + +Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow you to reuse previously deleted space (alternatively, one can simply remove the file and write again). + Notes & Caveats ~~~~~~~~~~~~~~~ @@ -1216,14 +1382,9 @@ Performance - ``Tables`` come with a writing performance penalty as compared to regular stores. The benefit is the ability to append/delete and query (potentially very large amounts of data). Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis. - - ``Tables`` can (as of 0.10.0) be expressed as different types. - - - ``AppendableTable`` which is a similiar table to past versions (this is the default). - - ``WORMTable`` (pending implementation) - is available to faciliate very fast writing of tables that are also queryable (but CANNOT support appends) - - - ``Tables`` offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning) - use the pytables utilities ``ptrepack`` to rewrite the file (and also can change compression methods) - - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) + - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing. + - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance. + - Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) Experimental ~~~~~~~~~~~~ diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt new file mode 100644 index 0000000000000..b8137fda540cd --- /dev/null +++ b/doc/source/v0.10.1.txt @@ -0,0 +1,129 @@ +.. _whatsnew_0101: + +v0.10.1 (January ??, 2013) +--------------------------- + +This is a minor release from 0.10.0 and includes many new features and +enhancements along with a large number of bug fixes. There are also a number of +important API changes that long-time pandas users should pay close attention +to. + +API changes +~~~~~~~~~~~ + +New features +~~~~~~~~~~~~ + +HDFStore +~~~~~~~~ + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``data_columns`` + +.. ipython:: python + + store = HDFStore('store.h5') + df = DataFrame(randn(8, 3), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C']) + df['string'] = 'foo' + df.ix[4:6,'string'] = np.nan + df.ix[7:9,'string'] = 'bar' + df['string2'] = 'cool' + df + + # on-disk operations + store.append('df', df, data_columns = ['B','C','string','string2']) + store.select('df',[ 'B > 0', 'string == foo' ]) + + # this is in-memory version of this type of selection + df[(df.B > 0) & (df.string == 'foo')] + +Retrieving unique values in an indexable or data column. + +.. ipython:: python + + store.unique('df','index') + store.unique('df','string') + +You can now store ``datetime64`` in data columns + +.. 
ipython:: python + + df_mixed = df.copy() + df_mixed['datetime64'] = Timestamp('20010102') + df_mixed.ix[3:4,['A','B']] = np.nan + + store.append('df_mixed', df_mixed) + df_mixed1 = store.select('df_mixed') + df_mixed1 + df_mixed1.get_dtype_counts() + +You can pass ``columns`` keyword to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` + +.. ipython:: python + + store.select('df',columns = ['A','B']) + +``HDFStore`` now serializes multi-index dataframes when appending tables. + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df + + store.append('mi',df) + store.select('mi') + + # the levels are automatically included as data columns + store.select('mi', Term('foo=bar')) + +Multi-table creation via ``append_to_multiple`` and selection via ``select_as_multiple`` can create/select from multiple tables and return a combined result, by using ``where`` on a selector table. + +.. ipython:: python + + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' + + # you can also create the tables individually + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store + + # indiviual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + +**Enhancements** + +- You can pass ``nan_rep = 'my_nan_rep'`` to append, to change the default nan representation on disk (which converts to/from `np.nan`), this defaults to `nan`. +- You can pass ``index`` to ``append``. This defaults to ``True``. This will automagically create indicies on the *indexables* and *data columns* of the table +- You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing. +- You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance. +- ``Select`` now supports passing ``start`` and ``stop`` to provide selection space limiting in selection. + + +See the `full release notes +`__ or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 82ed64680f1eb..6c125c45a2599 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,6 +16,8 @@ What's New These are new features and improvements of note in each release. +.. include:: v0.10.1.txt + .. include:: v0.10.0.txt .. 
include:: v0.9.1.txt diff --git a/pandas/__init__.py b/pandas/__init__.py index 1d45727257eeb..6c58c708b8306 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -32,7 +32,7 @@ from pandas.io.parsers import (read_csv, read_table, read_clipboard, read_fwf, to_clipboard, ExcelFile, ExcelWriter) -from pandas.io.pytables import HDFStore, Term +from pandas.io.pytables import HDFStore, Term, get_store from pandas.util.testing import debug from pandas.tools.describe import value_range diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 175845fd38a2b..346dfb7c8b4ce 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -21,7 +21,6 @@ from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.core.common import adjoin from pandas.core.algorithms import match, unique, factorize -from pandas.core.strings import str_len from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe, _try_sort from pandas.core.internals import BlockManager, make_block, form_blocks @@ -36,7 +35,7 @@ from contextlib import contextmanager # versioning attribute -_version = '0.10' +_version = '0.10.1' class IncompatibilityWarning(Warning): pass @@ -79,6 +78,13 @@ class IncompatibilityWarning(Warning): pass 'WidePanel': 'wide_table', } +# axes map +_AXES_MAP = { + DataFrame : [0], + Panel : [1,2], + Panel4D : [1,2,3], +} + # oh the troubles to reduce import time _table_mod = None _table_supports_index = False @@ -108,28 +114,7 @@ def get_store(path, mode='a', complevel=None, complib=None, Parameters ---------- - path : string - File path to HDF5 file - mode : {'a', 'w', 'r', 'r+'}, default 'a' - - ``'r'`` - Read-only; no data can be modified. - ``'w'`` - Write; a new file is created (an existing file with the same - name would be deleted). - ``'a'`` - Append; an existing file is opened for reading and writing, - and if the file does not exist it is created. - ``'r+'`` - It is similar to ``'a'``, but the file must already exist. - complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible - fletcher32 : bool, default False - If applying compression use the fletcher32 checksum + same as HDFStore Examples -------- @@ -336,7 +321,7 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -344,16 +329,102 @@ def select(self, key, where=None, **kwargs): Parameters ---------- key : object + + Optional Parameters + ------------------- where : list of Term (or convertable) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + columns : a list of columns that if not None, will limit the return columns """ group = self.get_node(key) if group is None: raise KeyError('No object named %s in the file' % key) - return self._read_group(group, where, **kwargs) + return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs) + + def select_as_coordinates(self, key, where=None, **kwargs): + """ + return the selection as a Coordinates. 
Note that start/stop/columns parematers are inapplicable here. + + Parameters + ---------- + key : object - def put(self, key, value, table=False, append=False, - compression=None, **kwargs): + Optional Parameters + ------------------- + where : list of Term (or convertable) objects, optional + """ + return self.get_table(key).read_coordinates(where = where, **kwargs) + + def unique(self, key, column, **kwargs): + """ + return a single column uniquely from the table. This is generally only useful to select an indexable + + Parameters + ---------- + key : object + column: the column of interest + + Exceptions + ---------- + raises KeyError if the column is not found (or key is not a valid store) + raises ValueError if the column can not be extracted indivually (it is part of a data block) + + """ + return self.get_table(key).read_column(column = column, **kwargs) + + def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kwargs): + """ Retrieve pandas objects from multiple tables + + Parameters + ---------- + keys : a list of the tables + selector : the table to apply the where criteria (defaults to keys[0] if not supplied) + columns : the columns I want back + + Exceptions + ---------- + raise if any of the keys don't refer to tables or if they are not ALL THE SAME DIMENSIONS + """ + + # default to single select + if isinstance(keys, (list,tuple)) and len(keys) == 1: + keys = keys[0] + if isinstance(keys,basestring): + return self.select(key = keys, where=where, columns = columns, **kwargs) + + if not isinstance(keys, (list,tuple)): + raise Exception("keys must be a list/tuple") + + if len(keys) == 0: + raise Exception("keys must have a non-zero length") + + if selector is None: + selector = keys[0] + + # collect the tables + tbls = [ self.get_table(k) for k in keys ] + + # validate rows + nrows = tbls[0].nrows + for t in tbls: + if t.nrows != nrows: + raise Exception("all tables must have exactly the same nrows!") + + # select coordinates from the selector table + c = self.select_as_coordinates(selector, where) + + # collect the returns objs + objs = [ t.read(where = c, columns = columns) for t in tbls ] + + # axis is the concentation axes + axis = list(set([ t.non_index_axes[0][0] for t in tbls ]))[0] + + # concat and return + return concat(objs, axis = axis, verify_integrity = True) + + def put(self, key, value, table=False, append=False, **kwargs): """ Store object in HDFStore @@ -368,15 +439,10 @@ def put(self, key, value, table=False, append=False, append : boolean, default False For table data structures, append the input data to the existing table - compression : {None, 'blosc', 'lzo', 'zlib'}, default None - Use a compression algorithm to compress the data - If None, the compression settings specified in the ctor will - be used. """ - self._write_to_group(key, value, table=table, append=append, - comp=compression, **kwargs) + self._write_to_group(key, value, table=table, append=append, **kwargs) - def remove(self, key, where=None): + def remove(self, key, where=None, start=None, stop=None): """ Remove pandas object partially by specifying the where condition @@ -384,9 +450,12 @@ def remove(self, key, where=None): ---------- key : string Node to remove or delete rows from - where : list - For Table node, delete specified rows. 
See HDFStore.select for more - information + + Optional Parameters + ------------------- + where : list of Term (or convertable) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection Returns ------- @@ -406,11 +475,11 @@ def remove(self, key, where=None): if not _is_table_type(group): raise Exception('can only remove with where on objects written as tables') t = create_table(self, group) - return t.delete(where) + return t.delete(where = where, start=start, stop=stop) return None - def append(self, key, value, **kwargs): + def append(self, key, value, columns = None, **kwargs): """ Append to Table in file. Node must already exist and be Table format. @@ -418,15 +487,83 @@ def append(self, key, value, **kwargs): Parameters ---------- key : object - value : {Series, DataFrame, Panel} + value : {Series, DataFrame, Panel, Panel4D} + + Optional Parameters + ------------------- + data_columns : list of columns to create as data columns + min_itemsize : dict of columns that specify minimum string sizes + nan_rep : string to use as string nan represenation + chunksize : size to chunk the writing + expectedrows : expected TOTAL row size of this table Notes ----- Does *not* check if data being appended overlaps with existing data in the table, so be careful """ + if columns is not None: + raise Exception("columns is not a supported keyword in append, try data_columns") + self._write_to_group(key, value, table=True, append=True, **kwargs) + def append_to_multiple(self, d, value, selector, data_columns = None, axes = None, **kwargs): + """ + Append to multiple tables + + Parameters + ---------- + d : a dict of table_name to table_columns, None is acceptable as the values of one node (this will get all the remaining columns) + value : a pandas object + selector : a string that designates the indexable table; all of its columns will be designed as data_columns, unless data_columns is passed, + in which case these are used + + Notes + ----- + axes parameter is currently not accepted + + """ + if axes is not None: + raise Exception("axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + + if not isinstance(d, dict): + raise Exception("append_to_multiple must have a dictionary specified as the way to split the value") + + if selector not in d: + raise Exception("append_to_multiple requires a selector that is in passed dict") + + # figure out the splitting axis (the non_index_axis) + axis = list(set(range(value.ndim))-set(_AXES_MAP[type(value)]))[0] + + # figure out how to split the value + remain_key = None + remain_values = [] + for k, v in d.items(): + if v is None: + if remain_key is not None: + raise Exception("append_to_multiple can only have one value in d that is None") + remain_key = k + else: + remain_values.extend(v) + if remain_key is not None: + ordered = value.axes[axis] + ordd = ordered-Index(remain_values) + ordd = sorted(ordered.get_indexer(ordd)) + d[remain_key] = ordered.take(ordd) + + # data_columns + if data_columns is None: + data_columns = d[selector] + + # append + for k, v in d.items(): + dc = data_columns if k == selector else None + + # compute the val + val = value.reindex_axis(v, axis = axis, copy = False) + + self.append(k, val, data_columns = dc, **kwargs) + def create_table_index(self, key, **kwargs): """ Create a pytables index on the table Paramaters @@ -464,13 +601,24 @@ def get_node(self, key): except: return None 
+ def get_table(self, key): + """ return the table object for a key, raise if not in the file or a non-table """ + group = self.get_node(key) + if group is None: + raise KeyError('No object named %s in the file' % key) + if not _is_table_type(group): + raise Exception("cannot return a table object for a non-table") + t = create_table(self, group) + t.infer_axes() + return t + ###### private methods ###### def _get_handler(self, op, kind): return getattr(self, '_%s_%s' % (op, kind)) def _write_to_group(self, key, value, table=False, append=False, - comp=None, **kwargs): + complib=None, **kwargs): group = self.get_node(key) if group is None: paths = key.split('/') @@ -494,20 +642,19 @@ def _write_to_group(self, key, value, table=False, append=False, kind = '%s_table' % kind handler = self._get_handler(op='write', kind=kind) wrapper = lambda value: handler(group, value, append=append, - comp=comp, **kwargs) + complib=complib, **kwargs) else: if append: raise ValueError('Can only append to Tables') - if comp: + if complib: raise ValueError('Compression only supported on Tables') handler = self._get_handler(op='write', kind=kind) wrapper = lambda value: handler(group, value) - wrapper(value) group._v_attrs.pandas_type = kind group._v_attrs.pandas_version = _version - #group._v_attrs.meta = getattr(value,'meta',None) + wrapper(value) def _write_series(self, group, series): self._write_index(group, 'index', series.index) @@ -589,7 +736,7 @@ def _read_sparse_panel(self, group, where=None): def _write_frame(self, group, df): self._write_block_manager(group, df._data) - def _read_frame(self, group, where=None): + def _read_frame(self, group, where=None, **kwargs): return DataFrame(self._read_block_manager(group)) def _write_block_manager(self, group, data): @@ -631,34 +778,39 @@ def _write_wide(self, group, panel): panel._consolidate_inplace() self._write_block_manager(group, panel._data) - def _read_wide(self, group, where=None): + def _read_wide(self, group, where=None, **kwargs): return Panel(self._read_block_manager(group)) - def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, **kwargs): + def _write_ndim_table(self, group, obj, append=False, axes=None, index=True, **kwargs): if axes is None: - axes = [1,2,3] + axes = _AXES_MAP[type(obj)] t = create_table(self, group, typ = 'appendable_ndim') - t.write(axes=axes, obj=obj, - append=append, compression=comp, **kwargs) + t.write(axes=axes, obj=obj, append=append, **kwargs) + if index: + t.create_index(columns = index) def _read_ndim_table(self, group, where=None, **kwargs): t = create_table(self, group, **kwargs) - return t.read(where) + return t.read(where, **kwargs) - def _write_frame_table(self, group, df, append=False, comp=None, axes=None, **kwargs): + def _write_frame_table(self, group, df, append=False, axes=None, index=True, **kwargs): if axes is None: - axes = [0] - t = create_table(self, group, typ = 'appendable_frame') - t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) + axes = _AXES_MAP[type(df)] + + t = create_table(self, group, typ = 'appendable_frame' if df.index.nlevels == 1 else 'appendable_multiframe') + t.write(axes=axes, obj=df, append=append, **kwargs) + if index: + t.create_index(columns = index) _read_frame_table = _read_ndim_table - def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, **kwargs): + def _write_wide_table(self, group, panel, append=False, axes=None, index=True, **kwargs): if axes is None: - axes = [1,2] + axes = _AXES_MAP[type(panel)] t = 
create_table(self, group, typ = 'appendable_panel') - t.write(axes=axes, obj=panel, - append=append, compression=comp, **kwargs) + t.write(axes=axes, obj=panel, append=append, **kwargs) + if index: + t.create_index(columns = index) _read_wide_table = _read_ndim_table @@ -847,14 +999,9 @@ def _read_group(self, group, where=None, **kwargs): kind = group._v_attrs.pandas_type kind = _LEGACY_MAP.get(kind, kind) handler = self._get_handler(op='read', kind=kind) - v = handler(group, where, **kwargs) - #if v is not None: - # meta = getattr(group._v_attrs,'meta',None) - # if meta is not None: - # v.meta = meta - return v - - def _read_series(self, group, where=None): + return handler(group, where=where, **kwargs) + + def _read_series(self, group, where=None, **kwargs): index = self._read_index(group, 'index') if len(index) > 0: values = _read_array(group, 'values') @@ -864,12 +1011,12 @@ def _read_series(self, group, where=None): name = getattr(group._v_attrs, 'name', None) return Series(values, index=index, name=name) - def _read_legacy_series(self, group, where=None): + def _read_legacy_series(self, group, where=None, **kwargs): index = self._read_index_legacy(group, 'index') values = _read_array(group, 'values') return Series(values, index=index) - def _read_legacy_frame(self, group, where=None): + def _read_legacy_frame(self, group, where=None, **kwargs): index = self._read_index_legacy(group, 'index') columns = self._read_index_legacy(group, 'columns') values = _read_array(group, 'values') @@ -894,7 +1041,9 @@ class IndexCol(object): pos : the position in the pytables """ - is_indexable = True + is_an_indexable = True + is_data_indexable = True + is_searchable = False def __init__(self, values = None, kind = None, typ = None, cname = None, itemsize = None, name = None, axis = None, kind_attr = None, pos = None, **kwargs): self.values = values @@ -948,6 +1097,9 @@ def __eq__(self, other): """ compare 2 col items """ return all([ getattr(self,a,None) == getattr(other,a,None) for a in ['name','cname','axis','pos'] ]) + def __ne__(self, other): + return not self.__eq__(other) + def copy(self): new_self = copy.copy(self) return new_self @@ -959,10 +1111,20 @@ def infer(self, table): new_self.get_attr() return new_self - def convert(self, values): - """ set the values from this selection """ - self.values = Index(_maybe_convert(values[self.cname], self.kind)) - + def convert(self, values, nan_rep): + """ set the values from this selection: take = take ownership """ + try: + values = values[self.cname] + except: + pass + self.values = Index(_maybe_convert(values, self.kind)) + return self + + def take_data(self): + """ return the values & release the memory """ + self.values, values = None, self.values + return values + @property def attrs(self): return self.table._v_attrs @@ -1006,7 +1168,7 @@ def validate_col(self, itemsize = None): # validate this column for string truncation (or reset to the max size) dtype = getattr(self,'dtype',None) - if self.kind == 'string' or (dtype is not None and dtype.startswith('string')): + if self.kind == 'string': c = self.col if c is not None: @@ -1044,14 +1206,31 @@ class DataCol(IndexCol): data : the actual data cname : the column name in the table to hold the data (typeically values) """ - is_indexable = False + is_an_indexable = False + is_data_indexable = False + is_searchable = False @classmethod - def create_for_block(cls, i, **kwargs): + def create_for_block(cls, i = None, name = None, cname = None, version = None, **kwargs): """ return a new datacol with 
the block i """ - return cls(name = 'values_%d' % i, cname = 'values_block_%d' % i, **kwargs) - def __init__(self, values = None, kind = None, typ = None, cname = None, data = None, **kwargs): + if cname is None: + cname = name or 'values_block_%d' % i + if name is None: + name = cname + + # prior to 0.10.1, we named values blocks like: values_block_0 an the name values_0 + try: + if version[0] == 0 and version[1] <= 10 and version[2] == 0: + m = re.search("values_block_(\d+)",name) + if m: + name = "values_%s" % m.groups()[0] + except: + pass + + return cls(name = name, cname = cname, **kwargs) + + def __init__(self, values = None, kind = None, typ = None, cname = None, data = None, block = None, **kwargs): super(DataCol, self).__init__(values = values, kind = kind, typ = typ, cname = cname, **kwargs) self.dtype = None self.dtype_attr = "%s_dtype" % self.name @@ -1064,17 +1243,100 @@ def __eq__(self, other): """ compare 2 col items """ return all([ getattr(self,a,None) == getattr(other,a,None) for a in ['name','cname','dtype','pos'] ]) - def set_data(self, data): + def set_data(self, data, dtype = None): self.data = data if data is not None: - if self.dtype is None: + if dtype is not None: + self.dtype = dtype + self.set_kind() + elif self.dtype is None: self.dtype = data.dtype.name + self.set_kind() def take_data(self): """ return the data & release the memory """ self.data, data = None, self.data return data + def set_kind(self): + # set my kind if we can + if self.dtype is not None: + if self.dtype.startswith('string'): + self.kind = 'string' + elif self.dtype.startswith('float'): + self.kind = 'float' + elif self.dtype.startswith('int'): + self.kind = 'integer' + elif self.dtype.startswith('date'): + self.kind = 'datetime' + + def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs): + """ create and setup my atom from the block b """ + + self.values = list(block.items) + dtype = block.dtype.name + inferred_type = lib.infer_dtype(block.values.flatten()) + + if inferred_type == 'datetime64': + self.set_atom_datetime64(block) + elif inferred_type == 'date': + raise NotImplementedError("date is not implemented as a table column") + elif inferred_type == 'unicode': + raise NotImplementedError("unicode is not implemented as a table column") + + ### this is basically a catchall; if say a datetime64 has nans then will end up here ### + elif inferred_type == 'string' or dtype == 'object': + self.set_atom_string(block, existing_col, min_itemsize, nan_rep) + else: + self.set_atom_data(block) + + return self + + def get_atom_string(self, block, itemsize): + return _tables().StringCol(itemsize = itemsize, shape = block.shape[0]) + + def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): + # fill nan items with myself + data = block.fillna(nan_rep).values + + # itemsize is the maximum length of a string (along any dimension) + itemsize = lib.max_len_string_array(data.flatten()) + + # specified min_itemsize? 
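+        # descriptive note: a minimum keyed by this column's name takes precedence over the
+        # generic 'values' key; a non-dict min_itemsize (e.g. a single integer) is applied
+        # as-is in the max() below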
+ if isinstance(min_itemsize, dict): + min_itemsize = int(min_itemsize.get(self.name) or min_itemsize.get('values') or 0) + itemsize = max(min_itemsize or 0,itemsize) + + # check for column in the values conflicts + if existing_col is not None: + eci = existing_col.validate_col(itemsize) + if eci > itemsize: + itemsize = eci + + self.itemsize = itemsize + self.kind = 'string' + self.typ = self.get_atom_string(block, itemsize) + self.set_data(self.convert_string_data(data, itemsize)) + + def convert_string_data(self, data, itemsize): + return data.astype('S%s' % itemsize) + + def get_atom_data(self, block): + return getattr(_tables(),"%sCol" % self.kind.capitalize())(shape = block.shape[0]) + + def set_atom_data(self, block): + self.kind = block.dtype.name + self.typ = self.get_atom_data(block) + self.set_data(block.values.astype(self.typ._deftype)) + + def get_atom_datetime64(self, block): + return _tables().Int64Col(shape = block.shape[0]) + + def set_atom_datetime64(self, block): + self.kind = 'datetime64' + self.typ = self.get_atom_datetime64(block) + self.set_data(block.values.view('i8'),'datetime64') + @property def shape(self): return getattr(self.data,'shape',None) @@ -1099,21 +1361,42 @@ def validate_attr(self, append): raise Exception("appended items dtype do not match existing items dtype" " in table!") - def convert(self, values): + def convert(self, values, nan_rep): """ set the data from this selection (and convert to the correct dtype if we can) """ - self.set_data(values[self.cname]) + try: + values = values[self.cname] + except: + pass + self.set_data(values) # convert to the correct dtype if self.dtype is not None: - try: - self.data = self.data.astype(self.dtype) - except: - self.data = self.data.astype('O') + + # reverse converts + if self.dtype == 'datetime64': + self.data = np.asarray(self.data, dtype='M8[ns]') + elif self.dtype == 'date': + self.data = np.array([date.fromtimestamp(v) for v in self.data], dtype=object) + elif self.dtype == 'datetime': + self.data = np.array([datetime.fromtimestamp(v) for v in self.data], + dtype=object) + else: + + try: + self.data = self.data.astype(self.dtype) + except: + self.data = self.data.astype('O') + + # convert nans + if self.kind == 'string': + self.data = lib.array_replace_from_nan_rep(self.data.flatten(), nan_rep).reshape(self.data.shape) + return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs,self.kind_attr,None) self.dtype = getattr(self.attrs,self.dtype_attr,None) + self.set_kind() def set_attr(self): """ set the data for this colummn """ @@ -1121,6 +1404,23 @@ def set_attr(self): if self.dtype is not None: setattr(self.attrs,self.dtype_attr,self.dtype) +class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ + is_data_indexable = True + + @property + def is_searchable(self): + return self.kind == 'string' + + def get_atom_string(self, block, itemsize): + return _tables().StringCol(itemsize = itemsize) + + def get_atom_data(self, block): + return getattr(_tables(),"%sCol" % self.kind.capitalize())() + + def get_atom_datetime64(self, block): + return _tables().Int64Col() + class Table(object): """ represent a table: facilitate read/write of various types of tables @@ -1137,22 +1437,37 @@ class Table(object): These are attributes that are store in the main table node, they are necessary to recreate these tables when read back in. 
- index_axes: a list of tuples of the (original indexing axis and index column) + index_axes : a list of tuples of the (original indexing axis and index column) non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis) - values_axes : a list of the columns which comprise the data of this table + values_axes : a list of the columns which comprise the data of this table + data_columns : a list of the columns that we are allowing indexing (these become single columns in values_axes) + nan_rep : the string to use for nan representations for string objects + levels : the names of levels """ table_type = None obj_type = None ndim = None + levels = 1 def __init__(self, parent, group, **kwargs): self.parent = parent self.group = group - self.version = getattr(group._v_attrs,'pandas_version',None) + + # compute our version + version = getattr(group._v_attrs,'pandas_version',None) + try: + self.version = tuple([ int(x) for x in version.split('.') ]) + if len(self.version) == 2: + self.version = self.version + (0,) + except: + self.version = (0,0,0) + self.index_axes = [] self.non_index_axes = [] self.values_axes = [] + self.data_columns = [] + self.nan_rep = None self.selection = None @property @@ -1166,7 +1481,12 @@ def pandas_type(self): def __repr__(self): """ return a pretty representatgion of myself """ self.infer_axes() - return "%s (typ->%s,nrows->%s,indexers->[%s])" % (self.pandas_type,self.table_type_short,self.nrows,','.join([ a.name for a in self.index_axes ])) + dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else '' + return "%s (typ->%s,nrows->%s,indexers->[%s]%s)" % (self.pandas_type, + self.table_type_short, + self.nrows, + ','.join([ a.name for a in self.index_axes ]), + dc) __str__ = __repr__ @@ -1190,6 +1510,11 @@ def validate(self, other): def nrows(self): return getattr(self.table,'nrows',None) + @property + def nrows_expected(self): + """ based on our axes, compute the expected nrows """ + return np.prod([ i.cvalues.shape[0] for i in self.index_axes ]) + @property def table(self): """ return the table group """ @@ -1242,7 +1567,12 @@ def data_orientation(self): def queryables(self): """ return a dict of the kinds allowable columns for this object """ - return dict([ (a.cname,a.kind) for a in self.index_axes ] + [ (self.obj_type._AXIS_NAMES[axis],None) for axis, values in self.non_index_axes ]) + + # compute the values_axes queryables + return dict([ (a.cname,a.kind) for a in self.index_axes ] + + [ (self.obj_type._AXIS_NAMES[axis],None) for axis, values in self.non_index_axes ] + + [ (v.cname,v.kind) for v in self.values_axes if v.name in set(self.data_columns) ] + ) def index_cols(self): """ return a list of my index cols """ @@ -1258,12 +1588,15 @@ def set_attrs(self): self.attrs.index_cols = self.index_cols() self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes + self.attrs.data_columns = self.data_columns + self.attrs.nan_rep = self.nan_rep + self.attrs.levels = self.levels def validate_version(self, where = None): """ are we trying to operate on an old version? 
""" if where is not None: - if self.version is None or float(self.version) < 0.1: - warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % self.version, IncompatibilityWarning) + if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: + warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % '.'.join([ str(x) for x in self.version ]), IncompatibilityWarning) @property def indexables(self): @@ -1276,21 +1609,28 @@ def indexables(self): # index columns self._indexables.extend([ IndexCol(name = name, axis = axis, pos = i) for i, (axis, name) in enumerate(self.attrs.index_cols) ]) - # data columns + # values columns + dc = set(self.data_columns) base_pos = len(self._indexables) - self._indexables.extend([ DataCol.create_for_block(i = i, pos = base_pos + i ) for i, c in enumerate(self.attrs.values_cols) ]) + def f(i, c): + klass = DataCol + if c in dc: + klass = DataIndexableCol + return klass.create_for_block(i = i, name = c, pos = base_pos + i, version = self.version) + + self._indexables.extend([ f(i,c) for i, c in enumerate(self.attrs.values_cols) ]) return self._indexables def create_index(self, columns = None, optlevel = None, kind = None): """ Create a pytables index on the specified columns - note: cannot index Time64Col() currently; PyTables must be >= 2.3.1 + note: cannot index Time64Col() currently; PyTables must be >= 2.3 Paramaters ---------- - columns : None or list_like (the indexers to index) + columns : False (don't create an index), True (create all columns index), None or list_like (the indexers to index) optlevel: optimization level (defaults to 6) kind : kind of index (defaults to 'medium') @@ -1301,9 +1641,11 @@ def create_index(self, columns = None, optlevel = None, kind = None): """ if not self.infer_axes(): return + if columns is False: return - if columns is None: - columns = [ self.index_axes[0].name ] + # index all indexables and data_columns + if columns is None or columns is True: + columns = [ a.cname for a in self.axes if a.is_data_indexable ] if not isinstance(columns, (tuple,list)): columns = [ columns ] @@ -1338,7 +1680,7 @@ def create_index(self, columns = None, optlevel = None, kind = None): if not v.is_indexed: v.createIndex(**kw) - def read_axes(self, where): + def read_axes(self, where, **kwargs): """ create and return the axes sniffed from the table: return boolean for success """ # validate the version @@ -1348,12 +1690,12 @@ def read_axes(self, where): if not self.infer_axes(): return False # create the selection - self.selection = Selection(self, where) + self.selection = Selection(self, where = where, **kwargs) values = self.selection.select() # convert the data for a in self.axes: - a.convert(values) + a.convert(values, nan_rep = self.nan_rep) return True @@ -1365,28 +1707,42 @@ def infer_axes(self): if table is None: return False - self.index_axes, self.values_axes = [ a.infer(self.table) for a in self.indexables if a.is_indexable ], [ a.infer(self.table) for a in self.indexables if not a.is_indexable ] self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] - + self.data_columns = getattr(self.attrs,'data_columns',None) or [] + self.nan_rep = getattr(self.attrs,'nan_rep',None) + self.levels = getattr(self.attrs,'levels',None) or [] + self.index_axes = [ a.infer(self.table) for a in self.indexables if a.is_an_indexable ] + self.values_axes = [ a.infer(self.table) for a in self.indexables if not a.is_an_indexable ] return 
True - def get_data_blocks(self, obj): - """ return the data blocks for this obj """ - return obj._data.blocks + def get_object(self, obj): + """ return the data for this obj """ + return obj - def create_axes(self, axes, obj, validate = True, min_itemsize = None): + def create_axes(self, axes, obj, validate = True, nan_rep = None, data_columns = None, min_itemsize = None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields + + Parameters: + ----------- + axes: a list of the axes in order to create (names or numbers of the axes) + obj : the object to create axes on + validate: validate the obj against an existiing object already written + min_itemsize: a dict of the min size for a column in bytes + nan_rep : a values to use for string column nan_rep + data_columns : a list of columns that we want to create separate to allow indexing """ # map axes to numbers axes = [ obj._get_axis_number(a) for a in axes ] - # do we have an existing table (if so, use its axes)? + # do we have an existing table (if so, use its axes & data_columns) if self.infer_axes(): existing_table = self.copy() - axes = [ a.axis for a in existing_table.index_axes] + axes = [ a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep else: existing_table = None @@ -1396,6 +1752,18 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): # create according to the new data self.non_index_axes = [] + self.data_columns = [] + + # nan_representation + if nan_rep is None: + nan_rep = 'nan' + self.nan_rep = nan_rep + + # convert the objects if we can to better divine dtypes + try: + obj = obj.convert_objects() + except: + pass # create axes to index and non_index index_axes_map = dict() @@ -1428,67 +1796,75 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): for a in self.axes: a.maybe_set_size(min_itemsize = min_itemsize) - # reindex by our non_index_axes + # reindex by our non_index_axes & compute data_columns for a in self.non_index_axes: - obj = obj.reindex_axis(a[1], axis = a[0], copy = False) + obj = obj.reindex_axis(a[1], axis = a[0], copy = False) - blocks = self.get_data_blocks(obj) + # get out blocks + block_obj = self.get_object(obj) + blocks = None + + if data_columns is not None and len(self.non_index_axes): + axis = self.non_index_axes[0][0] + axis_labels = self.non_index_axes[0][1] + data_columns = [ c for c in data_columns if c in axis_labels ] + if len(data_columns): + blocks = block_obj.reindex_axis(Index(axis_labels)-Index(data_columns), axis = axis, copy = False)._data.blocks + for c in data_columns: + blocks.extend(block_obj.reindex_axis([ c ], axis = axis, copy = False)._data.blocks) + + if blocks is None: + blocks = block_obj._data.blocks # add my values self.values_axes = [] for i, b in enumerate(blocks): # shape of the data column are the indexable axes - shape = b.shape[0] - values = b.values + klass = DataCol + name = None - # a string column - if b.dtype.name == 'object': - - # itemsize is the maximum length of a string (along any dimension) - itemsize = _itemsize_string_array(values) + # we have a data_column + if data_columns and len(b.items) == 1 and b.items[0] in data_columns: + klass = DataIndexableCol + name = b.items[0] + self.data_columns.append(name) - # specified min_itemsize? 
- if isinstance(min_itemsize, dict): - itemsize = max(int(min_itemsize.get('values')),itemsize) - - # check for column in the values conflicts - if existing_table is not None and validate: - eci = existing_table.values_axes[i].validate_col(itemsize) - if eci > itemsize: - itemsize = eci - - atom = _tables().StringCol(itemsize = itemsize, shape = shape) - utype = 'S%s' % itemsize - kind = 'string' - - else: - atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = shape) - utype = atom._deftype - kind = b.dtype.name - - # coerce data to this type try: - values = values.astype(utype) + existing_col = existing_table.values_axes[i] if existing_table is not None and validate else None + + col = klass.create_for_block(i = i, name = name, version = self.version) + col.set_atom(block = b, + existing_col = existing_col, + min_itemsize = min_itemsize, + nan_rep = nan_rep, + **kwargs) + col.set_pos(j) + + self.values_axes.append(col) + except (NotImplementedError): + raise except (Exception), detail: - raise Exception("cannot coerce data type -> [dtype->%s]" % b.dtype.name) - - dc = DataCol.create_for_block(i = i, values = list(b.items), kind = kind, typ = atom, data = values, pos = j) + raise Exception("cannot find the correct atom type -> [dtype->%s] %s" % (b.dtype.name,str(detail))) j += 1 - self.values_axes.append(dc) # validate the axes if we have an existing table if validate: self.validate(existing_table) - def process_axes(self, obj): + def process_axes(self, obj, columns=None): """ process axes filters """ + # reorder by any non_index_axes & limit to the select columns + for axis,labels in self.non_index_axes: + if columns is not None: + labels = Index(labels) & Index(columns) + obj = obj.reindex_axis(labels,axis=axis,copy=False) + def reindex(obj, axis, filt, ordered): - axis_name = obj._get_axis_name(axis) ordd = ordered & filt ordd = sorted(ordered.get_indexer(ordd)) - return obj.reindex_axis(ordered.take(ordd), axis = obj._get_axis_number(axis_name), copy = False) + return obj.reindex_axis(ordered.take(ordd), axis = obj._get_axis_number(axis), copy = False) # apply the selection filters (but keep in the same order) if self.selection.filter: @@ -1497,21 +1873,23 @@ def reindex(obj, axis, filt, ordered): return obj - def create_description(self, compression = None, complevel = None): + def create_description(self, complib = None, complevel = None, fletcher32 = False, expectedrows = None): """ create the description of the table from the axes & values """ - d = { 'name' : 'table' } + # expected rows estimate + if expectedrows is None: + expectedrows = max(self.nrows_expected,10000) + d = dict( name = 'table', expectedrows = expectedrows ) # description from the axes & values d['description'] = dict([ (a.cname,a.typ) for a in self.axes ]) - if compression: - complevel = self.complevel + if complib: if complevel is None: - complevel = 9 - filters = _tables().Filters(complevel=complevel, - complib=compression, - fletcher32=self.fletcher32) + complevel = self.complevel or 9 + filters = _tables().Filters(complevel = complevel, + complib = complib, + fletcher32 = fletcher32 or self.fletcher32) d['filters'] = filters elif self.filters is not None: d['filters'] = self.filters @@ -1521,6 +1899,41 @@ def create_description(self, compression = None, complevel = None): def read(self, **kwargs): raise NotImplementedError("cannot read on an abstract table: subclasses should implement") + def read_coordinates(self, where=None, **kwargs): + """ select coordinates (row numbers) from a table; 
return the coordinates object """ + + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): return False + + # create the selection + self.selection = Selection(self, where = where, **kwargs) + return Coordinates(self.selection.select_coords(), group = self.group, where = where) + + def read_column(self, column, **kwargs): + """ return a single column from the table, generally only indexables are interesting """ + + # validate the version + self.validate_version() + + # infer the data kind + if not self.infer_axes(): return False + + # find the axes + for a in self.axes: + if column == a.name: + + if not a.is_data_indexable: + raise ValueError("column [%s] can not be extracted individually; it is not data indexable" % column) + + # column must be an indexable or a data column + c = getattr(self.table.cols,column) + return Categorical.from_array(a.convert(c[:], nan_rep = self.nan_rep).take_data()).levels + + raise KeyError("column [%s] not found in the table" % column) + def write(self, **kwargs): raise NotImplementedError("cannot write on an abstract table") @@ -1566,11 +1979,11 @@ class LegacyTable(Table): def write(self, **kwargs): raise Exception("write operations are not allowed on legacy tables!") - def read(self, where=None): + def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ - if not self.read_axes(where): return None + if not self.read_axes(where=where, **kwargs): return None factors = [ Categorical.from_array(a.values) for a in self.index_axes ] levels = [ f.levels for f in factors ] @@ -1639,12 +2052,8 @@ def read(self, where=None): else: wp = concat(objs, axis = 0, verify_integrity = True) - # reorder by any non_index_axes - for axis,labels in self.non_index_axes: - wp = wp.reindex_axis(labels,axis=axis,copy=False) - # apply the selection filters & axis orderings - wp = self.process_axes(wp) + wp = self.process_axes(wp, columns=columns) return wp @@ -1665,8 +2074,9 @@ class AppendableTable(LegacyTable): _indexables = None table_type = 'appendable' - def write(self, axes, obj, append=False, compression=None, - complevel=None, min_itemsize = None, **kwargs): + def write(self, axes, obj, append=False, complib=None, + complevel=None, fletcher32=None, min_itemsize = None, chunksize = 50000, + expectedrows = None, **kwargs): # create the table if it doesn't exist (or get it if it does) if not append: @@ -1674,12 +2084,15 @@ def write(self, axes, obj, append=False, compression=None, self.handle.removeNode(self.group, 'table') # create the axes - self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize) + self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize, **kwargs) if 'table' not in self.group: # create the table - options = self.create_description(compression = compression, complevel = complevel) + options = self.create_description(complib = complib, + complevel = complevel, + fletcher32 = fletcher32, + expectedrows = expectedrows) # set the table attributes self.set_attrs() @@ -1695,10 +2108,9 @@ def write(self, axes, obj, append=False, compression=None, a.validate_and_set(table, append) # add the rows - self.write_data() - self.handle.flush() + self.write_data(chunksize) - def write_data(self): + def write_data(self, chunksize): """ fast writing of data: requires specific cython routines each axis shape """ # create the masks & values @@ -1706,13 +2118,8 @@ def write_data(self): for a 
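``read_coordinates`` returns a ``Coordinates`` object (row numbers plus provenance) that can be fed back into a later ``select``. A sketch of the round trip through the store-level wrapper, following ``test_coordinates`` later in this patch::

    from pandas import DataFrame, HDFStore

    store = HDFStore('coords.h5')
    df = DataFrame(dict(A=range(5), B=range(5)))
    store.append('df', df)

    # row numbers matching the condition...
    c = store.select_as_coordinates('df', ['index<3'])

    # ...reused to pull the actual rows (potentially from another, aligned table)
    result = store.select('df', where=c)
    store.close()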
in self.values_axes: # figure the mask: only do if we can successfully process this column, otherwise ignore the mask - try: - mask = np.isnan(a.data).all(axis=0) - masks.append(mask.astype('u1')) - except: - - # need to check for Nan in a non-numeric type column!!! - masks.append(np.zeros((a.data.shape[1:]), dtype = 'u1')) + mask = com.isnull(a.data).all(axis=0) + masks.append(mask.astype('u1')) # consolidate masks mask = masks[0] @@ -1720,21 +2127,42 @@ def write_data(self): m = mask & m # the arguments - args = [ a.cvalues for a in self.index_axes ] - values = [ a.data for a in self.values_axes ] + indexes = [ a.cvalues for a in self.index_axes ] + search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1') + values = [ a.take_data() for a in self.values_axes ] + + # write the chunks + rows = self.nrows_expected + chunks = int(rows / chunksize) + 1 + for i in xrange(chunks): + start_i = i*chunksize + end_i = min((i+1)*chunksize,rows) + + self.write_data_chunk(indexes = [ a[start_i:end_i] for a in indexes ], + mask = mask[start_i:end_i], + search = search, + values = [ v[:,start_i:end_i] for v in values ]) + + def write_data_chunk(self, indexes, mask, search, values): # get our function try: func = getattr(lib,"create_hdf_rows_%sd" % self.ndim) - args.append(mask) - args.append(values) + args = list(indexes) + args.extend([ mask, search, values ]) rows = func(*args) + except (Exception), detail: + raise Exception("cannot create row-data -> %s" % str(detail)) + + try: if len(rows): self.table.append(rows) + self.table.flush() except (Exception), detail: + import pdb; pdb.set_trace() raise Exception("tables cannot write this data -> %s" % str(detail)) - def delete(self, where = None): + def delete(self, where = None, **kwargs): # delete all rows (and return the nrows) if where is None or not len(where): @@ -1747,7 +2175,7 @@ def delete(self, where = None): # create the selection table = self.table - self.selection = Selection(self, where) + self.selection = Selection(self, where, **kwargs) values = self.selection.select_coords() # delete the rows in reverse order @@ -1779,7 +2207,7 @@ def delete(self, where = None): table.removeRows(start = rows[rows.index[0]], stop = rows[rows.index[-1]]+1) pg = g - self.handle.flush() + self.table.flush() # return the number of rows removed return ln @@ -1794,42 +2222,71 @@ class AppendableFrameTable(AppendableTable): def is_transposed(self): return self.index_axes[0].axis == 1 - def get_data_blocks(self, obj): + def get_object(self, obj): """ these are written transposed """ if self.is_transposed: obj = obj.T - return obj._data.blocks + return obj - def read(self, where=None): + def read(self, where=None, columns=None, **kwargs): - if not self.read_axes(where): return None + if not self.read_axes(where=where, **kwargs): return None index = self.index_axes[0].values frames = [] for a in self.values_axes: - columns = Index(a.values) + cols = Index(a.values) if self.is_transposed: values = a.cvalues - index_ = columns - columns_ = index + index_ = cols + cols_ = Index(index) else: values = a.cvalues.T - index_ = index - columns_ = columns + index_ = Index(index) + cols_ = cols + - block = make_block(values, columns_, columns_) - mgr = BlockManager([ block ], [ columns_, index_ ]) + # if we have a DataIndexableCol, its shape will only be 1 dim + if values.ndim == 1: + values = values.reshape(1,values.shape[0]) + + block = make_block(values, cols_, cols_) + mgr = BlockManager([ block ], [ cols_, index_ ]) frames.append(DataFrame(mgr)) - df 
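``write_data`` now slices the row space into chunks before handing them to the Cython row builders, which bounds the memory footprint of large appends. Both knobs are exposed on ``append``; the values below are only examples::

    import pandas.util.testing as tm
    from pandas import HDFStore

    df = tm.makeTimeDataFrame()

    store = HDFStore('chunks.h5')
    # write in 10,000-row chunks (the default is 50,000)
    store.append('df', df, chunksize=10000)

    # tell PyTables roughly how many rows to expect, so it can size the table
    store.append('df2', df, expectedrows=len(df))
    store.close()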
= concat(frames, axis = 1, verify_integrity = True) - # sort the indicies & reorder the columns - for axis,labels in self.non_index_axes: - df = df.reindex_axis(labels,axis=axis,copy=False) + if len(frames) == 1: + df = frames[0] + else: + df = concat(frames, axis = 1, verify_integrity = True) # apply the selection filters & axis orderings - df = self.process_axes(df) + df = self.process_axes(df, columns=columns) + + return df + +class AppendableMultiFrameTable(AppendableFrameTable): + """ a frame with a multi-index """ + table_type = 'appendable_multiframe' + obj_type = DataFrame + ndim = 2 + + @property + def table_type_short(self): + return 'appendable_multi' + def write(self, obj, data_columns = None, **kwargs): + if data_columns is None: + data_columns = [] + for n in obj.index.names: + if n not in data_columns: + data_columns.insert(0,n) + self.levels = obj.index.names + return super(AppendableMultiFrameTable, self).write(obj = obj.reset_index(), data_columns = data_columns, **kwargs) + + def read(self, *args, **kwargs): + df = super(AppendableMultiFrameTable, self).read(*args, **kwargs) + df.set_index(self.levels, inplace=True) return df class AppendablePanelTable(AppendableTable): @@ -1838,11 +2295,11 @@ class AppendablePanelTable(AppendableTable): ndim = 3 obj_type = Panel - def get_data_blocks(self, obj): + def get_object(self, obj): """ these are written transposed """ if self.is_transposed: obj = obj.transpose(*self.data_orientation) - return obj._data.blocks + return obj @property def is_transposed(self): @@ -1856,7 +2313,8 @@ class AppendableNDimTable(AppendablePanelTable): # table maps _TABLE_MAP = { - 'appendable_frame' : AppendableFrameTable, + 'appendable_frame' : AppendableFrameTable, + 'appendable_multiframe' : AppendableMultiFrameTable, 'appendable_panel' : AppendablePanelTable, 'appendable_ndim' : AppendableNDimTable, 'worm' : WORMTable, @@ -1891,10 +2349,6 @@ def create_table(parent, group, typ = None, **kwargs): return _TABLE_MAP.get(tt)(parent, group, **kwargs) -def _itemsize_string_array(arr): - """ return the maximum size of elements in a strnig array """ - return max([ str_len(arr[v]).max() for v in range(arr.shape[0]) ]) - def _convert_index(index): if isinstance(index, DatetimeIndex): converted = index.asi8 @@ -2075,8 +2529,8 @@ class Term(object): """ - _ops = ['<=','<','>=','>','!=','='] - _search = re.compile("^(?P\w+)(?P%s)(?P.+)$" % '|'.join(_ops)) + _ops = ['<=','<','>=','>','!=','==','='] + _search = re.compile("^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % '|'.join(_ops)) def __init__(self, field, op = None, value = None, queryables = None): self.field = None @@ -2135,6 +2589,10 @@ def __init__(self, field, op = None, value = None, queryables = None): if self.field is None or self.op is None or self.value is None: raise Exception("Could not create this term [%s]" % str(self)) + # = vs == + if self.op == '==': + self.op = '=' + # we have valid conditions if self.op in ['>','>=','<','<=']: if hasattr(self.value,'__iter__') and len(self.value) > 1: @@ -2206,25 +2664,37 @@ def eval(self): raise Exception("passing a filterable condition to a non-table indexer [%s]" % str(self)) def convert_value(self, v): - - #### a little hacky here, need to really figure out what we should convert ####x - if self.field == 'index' or self.field == 'major_axis': - if self.kind == 'datetime64' : - return [lib.Timestamp(v).value, None] - elif isinstance(v, datetime) or hasattr(v,'timetuple') or self.kind == 'date': - return [time.mktime(v.timetuple()), None] - elif self.kind == 
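``AppendableMultiFrameTable`` writes a multi-indexed frame by resetting the index levels into ordinary data columns and restores them with ``set_index`` on read, so the levels are also queryable. A small sketch (level and column names are made up for illustration)::

    import numpy as np
    from pandas import DataFrame, HDFStore, MultiIndex

    index = MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
                                    ['one', 'two', 'one', 'two']],
                                   names=['key', 'num'])
    df_mi = DataFrame(np.random.randn(4, 2), index=index, columns=['X', 'Y'])

    store = HDFStore('mi.h5')
    store.append('df_mi', df_mi)

    # round-trips with the MultiIndex intact; the levels are data columns
    store.select('df_mi', ['key=a'])
    store.close()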
'integer': - v = int(float(v)) - return [v, v] - elif self.kind == 'float': - v = float(v) - return [v, v] + """ convert the expression that is in the term to something that is accepted by pytables """ + + if self.kind == 'datetime64' : + return [lib.Timestamp(v).value, None] + elif isinstance(v, datetime) or hasattr(v,'timetuple') or self.kind == 'date': + return [time.mktime(v.timetuple()), None] + elif self.kind == 'integer': + v = int(float(v)) + return [v, v] + elif self.kind == 'float': + v = float(v) + return [v, v] elif not isinstance(v, basestring): return [str(v), None] # string quoting return ["'" + v + "'", v] +class Coordinates(object): + """ holds a returned coordinates list, useful to select the same rows from different tables + + coordinates : holds the array of coordinates + group : the source group + where : the source where + """ + + def __init__(self, values, group, where, **kwargs): + self.values = values + self.group = group + self.where = where + class Selection(object): """ Carries out a selection operation on a tables.Table object. @@ -2233,24 +2703,33 @@ class Selection(object): ---------- table : a Table object where : list of Terms (or convertable to) + start, stop: indicies to start and/or stop selection """ - def __init__(self, table, where=None): + def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.table = table self.where = where + self.start = start + self.stop = stop self.condition = None self.filter = None - self.terms = self.generate(where) - - # create the numexpr & the filter - if self.terms: - conds = [ t.condition for t in self.terms if t.condition is not None ] - if len(conds): - self.condition = "(%s)" % ' & '.join(conds) - self.filter = [] - for t in self.terms: - if t.filter is not None: - self.filter.append(t.filter) + self.terms = None + self.coordinates = None + + if isinstance(where, Coordinates): + self.coordinates = where.values + else: + self.terms = self.generate(where) + + # create the numexpr & the filter + if self.terms: + conds = [ t.condition for t in self.terms if t.condition is not None ] + if len(conds): + self.condition = "(%s)" % ' & '.join(conds) + self.filter = [] + for t in self.terms: + if t.filter is not None: + self.filter.append(t.filter) def generate(self, where): """ where can be a : dict,list,tuple,string """ @@ -2259,9 +2738,12 @@ def generate(self, where): if not isinstance(where, (list,tuple)): where = [ where ] else: - # do we have all list/tuple + + # make this a list of we think that we only have a sigle term & no operands inside any terms if not any([ isinstance(w, (list,tuple,Term)) for w in where ]): - where = [ where ] + + if not any([ isinstance(w,basestring) and Term._search.match(w) for w in where ]): + where = [ where ] queryables = self.table.queryables() return [ Term(c, queryables = queryables) for c in where ] @@ -2271,15 +2753,19 @@ def select(self): generate the selection """ if self.condition is not None: - return self.table.table.readWhere(self.condition) - else: - return self.table.table.read() + return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop) + elif self.coordinates is not None: + return self.table.table.readCoordinates(self.coordinates) + return self.table.table.read(start=self.start,stop=self.stop) def select_coords(self): """ generate the selection """ - return self.table.table.getWhereList(self.condition, sort = True) + if self.condition is None: + return np.arange(self.table.nrows) + + return 
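The widened ``_ops`` list and whitespace-tolerant regex mean a ``Term`` can now be spelled with spaces around the operator, and ``==`` is normalized to ``=``. Equivalent spellings, for illustration::

    from pandas.io.pytables import Term

    Term('index', '>', '20121114')
    Term('index>20121114')
    Term('index > 20121114')      # whitespace around the operator now parses
    Term('minor_axis == A')       # '==' is accepted and treated as '='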
self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort = True) def _get_index_factory(klass): diff --git a/pandas/io/tests/legacy_0.10.h5 b/pandas/io/tests/legacy_0.10.h5 new file mode 100644 index 0000000000000..b1439ef16361a Binary files /dev/null and b/pandas/io/tests/legacy_0.10.h5 differ diff --git a/pandas/io/tests/legacy_table.h5 b/pandas/io/tests/legacy_table.h5 index 1c90382d9125c..5f4089efc15c3 100644 Binary files a/pandas/io/tests/legacy_table.h5 and b/pandas/io/tests/legacy_table.h5 differ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 2b0d1cda89392..6f11ebdaaa7b3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4,7 +4,7 @@ import sys import warnings -from datetime import datetime +import datetime import numpy as np from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, @@ -93,16 +93,17 @@ def test_versioning(self): self.store.remove('df1') self.store.append('df1', df[:10]) self.store.append('df1', df[10:]) - self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10') - self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10') - self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') + self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10.1') + self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10.1') + self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10.1') # write a file and wipe its versioning self.store.remove('df2') self.store.append('df2', df) + + # this is an error because its table_type is appendable, but no version info self.store.get_node('df2')._v_attrs.pandas_version = None - self.store.select('df2') - self.store.select('df2', [ Term('index','>',df.index[2]) ]) + self.assertRaises(Exception, self.store.select,'df2') def test_meta(self): raise nose.SkipTest('no meta') @@ -202,12 +203,12 @@ def test_put_string_index(self): def test_put_compression(self): df = tm.makeTimeDataFrame() - self.store.put('c', df, table=True, compression='zlib') + self.store.put('c', df, table=True, complib='zlib') tm.assert_frame_equal(self.store['c'], df) # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, - table=False, compression='zlib') + table=False, complib='zlib') def test_put_compression_blosc(self): tm.skip_if_no_package('tables', '2.2', app='blosc support') @@ -215,9 +216,9 @@ def test_put_compression_blosc(self): # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, - table=False, compression='blosc') + table=False, complib='blosc') - self.store.put('c', df, table=True, compression='blosc') + self.store.put('c', df, table=True, complib='blosc') tm.assert_frame_equal(self.store['c'], df) def test_put_integer(self): @@ -287,6 +288,13 @@ def test_append(self): self.store.append('wp1', wp_append2) tm.assert_panel_equal(self.store['wp1'], wp) + # dtype issues - mizxed type in a single object column + df = DataFrame(data=[[1,2],[0,1],[1,2],[0,0]]) + df['mixed_column'] = 'testing' + df.ix[2,'mixed_column'] = np.nan + self.store.remove('df') + self.store.append('df', df) + tm.assert_frame_equal(self.store['df'],df) def test_append_frame_column_oriented(self): @@ -373,11 +381,15 @@ def test_append_with_strings(self): wp = tm.makePanel() wp2 = wp.rename_axis(dict([ (x,"%s_extra" % x) for x in wp.minor_axis ]), axis = 2) + def check_col(key,name,size): + self.assert_(getattr(self.store.get_table(key).table.description,name).itemsize == 
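``create_description`` now takes ``complib``/``complevel``/``fletcher32`` directly, replacing the removed ``compression`` keyword on ``put``. Usage sketch (store names are illustrative)::

    import pandas.util.testing as tm
    from pandas import HDFStore

    df = tm.makeTimeDataFrame()

    store = HDFStore('compressed.h5')
    # compression is only available for the table format
    store.put('zlib_table', df, table=True, complib='zlib')
    # blosc needs a PyTables build with blosc support
    store.put('blosc_table', df, table=True, complib='blosc')
    store.close()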
size) + self.store.append('s1', wp, min_itemsize = 20) self.store.append('s1', wp2) expected = concat([ wp, wp2], axis = 2) expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s1'], expected) + check_col('s1','minor_axis',20) # test dict format self.store.append('s2', wp, min_itemsize = { 'minor_axis' : 20 }) @@ -385,6 +397,7 @@ def test_append_with_strings(self): expected = concat([ wp, wp2], axis = 2) expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s2'], expected) + check_col('s2','minor_axis',20) # apply the wrong field (similar to #1) self.store.append('s3', wp, min_itemsize = { 'major_axis' : 20 }) @@ -396,55 +409,189 @@ def test_append_with_strings(self): # avoid truncation on elements df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) - self.store.append('df_big',df, min_itemsize = { 'values' : 1024 }) + self.store.append('df_big',df) tm.assert_frame_equal(self.store.select('df_big'), df) + check_col('df_big','values_block_1',15) # appending smaller string ok df2 = DataFrame([[124,'asdqy'], [346,'dggnhefbdfb']]) self.store.append('df_big',df2) expected = concat([ df, df2 ]) tm.assert_frame_equal(self.store.select('df_big'), expected) + check_col('df_big','values_block_1',15) # avoid truncation on elements df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) - self.store.append('df_big2',df, min_itemsize = { 'values' : 10 }) + self.store.append('df_big2',df, min_itemsize = { 'values' : 50 }) tm.assert_frame_equal(self.store.select('df_big2'), df) + check_col('df_big2','values_block_1',50) # bigger string on next append - self.store.append('df_new',df, min_itemsize = { 'values' : 16 }) + self.store.append('df_new',df) df_new = DataFrame([[124,'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) self.assertRaises(Exception, self.store.append, 'df_new',df_new) + # with nans + self.store.remove('df') + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[1:4,'string'] = np.nan + df['string2'] = 'bar' + df.ix[4:8,'string2'] = np.nan + df['string3'] = 'bah' + df.ix[1:,'string3'] = np.nan + self.store.append('df',df) + result = self.store.select('df') + tm.assert_frame_equal(result,df) + + + def test_append_with_data_columns(self): + + df = tm.makeTimeDataFrame() + self.store.remove('df') + self.store.append('df', df[:2], data_columns = ['B']) + self.store.append('df', df[2:]) + tm.assert_frame_equal(self.store['df'], df) + + # check that we have indicies created + assert(self.store.handle.root.df.table.cols.index.is_indexed == True) + assert(self.store.handle.root.df.table.cols.B.is_indexed == True) + + # data column searching + result = self.store.select('df', [ Term('B>0') ]) + expected = df[df.B>0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = self.store.select('df', [ Term('B>0'), Term('index','>',df.index[3]) ]) + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B>0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['string']) + result = self.store.select('df', [ Term('string', '=', 'foo') ]) + expected = df_new[df_new.string == 'foo'] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def 
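String columns containing ``np.nan`` now round-trip: missing values are serialized as the ``nan_rep`` token and converted back on read. A sketch (the replacement token here is arbitrary; the default is ``'nan'``)::

    import numpy as np
    import pandas.util.testing as tm
    from pandas import HDFStore

    df = tm.makeTimeDataFrame()
    df['string'] = 'foo'
    df.ix[1:4, 'string'] = np.nan

    store = HDFStore('nans.h5')
    store.append('df', df, nan_rep='_missing_')
    result = store.select('df')      # the NaNs are restored
    store.close()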
check_col(key,name,size): + self.assert_(getattr(self.store.get_table(key).table.description,name).itemsize == size) + + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['string'], min_itemsize = { 'string' : 30 }) + check_col('df','string',30) + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['string'], min_itemsize = 30) + check_col('df','string',30) + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['string'], min_itemsize = { 'values' : 30 }) + check_col('df','string',30) + + df_new['string2'] = 'foobarbah' + df_new['string_block1'] = 'foobarbah1' + df_new['string_block2'] = 'foobarbah2' + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['string','string2'], min_itemsize = { 'string' : 30, 'string2' : 40, 'values' : 50 }) + check_col('df','string',30) + check_col('df','string2',40) + check_col('df','values_block_1',50) + + # multiple data columns + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + df_new['string2'] = 'foo' + df_new['string2'][2:5] = np.nan + df_new['string2'][7:8] = 'bar' + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['A','B','string','string2']) + result = self.store.select('df', [ Term('string', '=', 'foo'), Term('string2=foo'), Term('A>0'), Term('B<0') ]) + expected = df_new[(df_new.string == 'foo') & (df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + tm.assert_frame_equal(result, expected) + + # yield an empty frame + result = self.store.select('df', [ Term('string', '=', 'foo'), Term('string2=bar'), Term('A>0'), Term('B<0') ]) + expected = df_new[(df_new.string == 'foo') & (df_new.string2 == 'bar') & (df_new.A > 0) & (df_new.B < 0)] + tm.assert_frame_equal(result, expected) + + # doc example + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6,'string'] = np.nan + df_dc.ix[7:9,'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc['datetime'] = Timestamp('20010102') + df_dc = df_dc.convert_objects() + df_dc.ix[3:5,['A','B','datetime']] = np.nan + + self.store.remove('df_dc') + self.store.append('df_dc', df_dc, data_columns = ['B','C','string','string2','datetime']) + result = self.store.select('df_dc',[ Term('B>0') ]) + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = self.store.select('df_dc',[ 'B > 0', 'C > 0', 'string == foo' ]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + tm.assert_frame_equal(result, expected) + def test_create_table_index(self): + + def col(t,column): + return getattr(self.store.get_table(t).table.cols,column) + + # index=False wp = tm.makePanel() - self.store.append('p5', wp) - self.store.create_table_index('p5') + self.store.append('p5', wp, index=False) + self.store.create_table_index('p5', columns = ['major_axis']) + assert(col('p5','major_axis').is_indexed == True) + assert(col('p5','minor_axis').is_indexed == False) - assert(self.store.handle.root.p5.table.cols.major_axis.is_indexed == True) - assert(self.store.handle.root.p5.table.cols.minor_axis.is_indexed == False) + # index=True + self.store.append('p5i', wp, index=True) + assert(col('p5i','major_axis').is_indexed == True) + assert(col('p5i','minor_axis').is_indexed == True) # default optlevels - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + self.store.get_table('p5').create_index() + 
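With several data columns declared, conditions on any of them can be combined in one ``where`` (terms are ANDed). A sketch mirroring the multi-column selections exercised in these tests::

    import numpy as np
    import pandas.util.testing as tm
    from pandas import HDFStore

    df = tm.makeTimeDataFrame()
    df['string'] = 'foo'
    df.ix[5:6, 'string'] = 'bar'

    store = HDFStore('dc.h5')
    store.append('df', df, data_columns=['A', 'B', 'string'])

    # conditions on different data columns are ANDed together
    store.select('df', ['A > 0', 'B < 0', 'string == foo'])
    store.close()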
assert(col('p5','major_axis').index.optlevel == 6) + assert(col('p5','minor_axis').index.kind == 'medium') # let's change the indexing scheme self.store.create_table_index('p5') - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + assert(col('p5','major_axis').index.optlevel == 6) + assert(col('p5','minor_axis').index.kind == 'medium') self.store.create_table_index('p5', optlevel=9) - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + assert(col('p5','major_axis').index.optlevel == 9) + assert(col('p5','minor_axis').index.kind == 'medium') self.store.create_table_index('p5', kind='full') - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'full') + assert(col('p5','major_axis').index.optlevel == 9) + assert(col('p5','minor_axis').index.kind == 'full') self.store.create_table_index('p5', optlevel=1, kind='light') - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 1) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'light') - + assert(col('p5','major_axis').index.optlevel == 1) + assert(col('p5','minor_axis').index.kind == 'light') + + # data columns df = tm.makeTimeDataFrame() - self.store.append('f', df[:10]) - self.store.append('f', df[10:]) - self.store.create_table_index('f') + df['string'] = 'foo' + df['string2'] = 'bar' + self.store.append('f', df, data_columns=['string','string2']) + assert(col('f','index').is_indexed == True) + assert(col('f','string').is_indexed == True) + assert(col('f','string2').is_indexed == True) + + # specify index=columns + self.store.append('f2', df, index=['string'], data_columns=['string','string2']) + assert(col('f2','index').is_indexed == False) + assert(col('f2','string').is_indexed == True) + assert(col('f2','string2').is_indexed == False) # try to index a non-table self.store.put('f2', df) @@ -474,24 +621,86 @@ def test_create_table_index(self): tables.__version__ = original - def test_big_table(self): - raise nose.SkipTest('no big table') + def test_big_table_frame(self): + raise nose.SkipTest('no big table frame') # create and write a big table - wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ], - major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ]) + df = DataFrame(np.random.randn(2000*100, 100), index = range(2000*100), columns = [ 'E%03d' % i for i in xrange(100) ]) + for x in range(20): + df['String%03d' % x] = 'string%03d' % x + + import time + x = time.time() + try: + store = HDFStore(self.scratchpath) + store.append('df',df) + rows = store.root.df.table.nrows + recons = store.select('df') + finally: + store.close() + os.remove(self.scratchpath) + + print "\nbig_table frame [%s] -> %5.2f" % (rows,time.time()-x) + + + def test_big_table2_frame(self): + # this is a really big table: 2.5m rows x 300 float columns, 20 string columns + raise nose.SkipTest('no big table2 frame') + + # create and write a big table + print "\nbig_table2 start" + import time + start_time = time.time() + df = DataFrame(np.random.randn(2.5*1000*1000, 300), index = range(int(2.5*1000*1000)), columns = [ 'E%03d' % i for i in xrange(300) ]) + for x in range(20): + df['String%03d' % x] = 'string%03d' % x + + print "\nbig_table2 frame 
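The ``index`` keyword on ``append`` now controls which columns get a PyTables index automatically, and ``create_table_index`` can still be called afterwards to tune ``optlevel``/``kind``, as the assertions above check. Sketch (keys are illustrative)::

    import pandas.util.testing as tm
    from pandas import HDFStore

    store = HDFStore('indexing.h5')

    # turn off automatic indexing on append, then index chosen columns later
    wp = tm.makePanel()
    store.append('p5', wp, index=False)
    store.create_table_index('p5', columns=['major_axis'])
    store.create_table_index('p5', optlevel=9, kind='full')

    # for a frame, index only specific (data) columns at append time
    df = tm.makeTimeDataFrame()
    df['string'] = 'foo'
    store.append('f2', df, index=['string'], data_columns=['string'])
    store.close()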
(creation of df) [rows->%s] -> %5.2f" % (len(df.index),time.time()-start_time) + fn = 'big_table2.h5' + + try: + + def f(chunksize): + store = HDFStore(fn,mode = 'w') + store.append('df',df,chunksize=chunksize) + r = store.root.df.table.nrows + store.close() + return r + + for c in [ 10000, 50000, 100000, 250000 ]: + start_time = time.time() + print "big_table2 frame [chunk->%s]" % c + rows = f(c) + print "big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows,c,time.time()-start_time) + + finally: + os.remove(fn) + + def test_big_table_panel(self): + raise nose.SkipTest('no big table panel') + + # create and write a big table + wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%03d' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in xrange(1000) ]) wp.ix[:,100:200,300:400] = np.nan + for x in range(100): + wp['String%03d'] = 'string%03d' % x + + import time + x = time.time() try: store = HDFStore(self.scratchpath) - store._debug_memory = True - store.append('wp',wp) + store.prof_append('wp',wp) + rows = store.root.wp.table.nrows recons = store.select('wp') finally: store.close() os.remove(self.scratchpath) + print "\nbig_table panel [%s] -> %5.2f" % (rows,time.time()-x) + def test_append_diff_item_order(self): raise nose.SkipTest('append diff item order') @@ -503,6 +712,30 @@ def test_append_diff_item_order(self): self.assertRaises(Exception, self.store.put, 'panel', wp2, append=True) + def test_append_hierarchical(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.store.append('mi',df) + result = self.store.select('mi') + tm.assert_frame_equal(result, df) + + def test_append_misc(self): + + df = tm.makeDataFrame() + self.store.append('df',df,chunksize=1) + result = self.store.select('df') + tm.assert_frame_equal(result, df) + + self.store.append('df1',df,expectedrows=10) + result = self.store.select('df1') + tm.assert_frame_equal(result, df) + def test_table_index_incompatible_dtypes(self): df1 = DataFrame({'a': [1, 2, 3]}) df2 = DataFrame({'a': [4, 5, 6]}, @@ -527,51 +760,66 @@ def test_table_values_dtypes_roundtrip(self): def test_table_mixed_dtypes(self): # frame - def _make_one_df(): - df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - return df.consolidate() - - df1 = _make_one_df() - - self.store.append('df1_mixed', df1) - tm.assert_frame_equal(self.store.select('df1_mixed'), df1) + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001,1,2,0,0) + df['datetime2'] = datetime.datetime(2001,1,3,0,0) + df.ix[3:6,['obj1']] = np.nan + df = df.consolidate().convert_objects() + + self.store.append('df1_mixed', df) + tm.assert_frame_equal(self.store.select('df1_mixed'), df) # panel - def _make_one_panel(): - wp = tm.makePanel() - wp['obj1'] = 'foo' - wp['obj2'] = 'bar' - wp['bool1'] = wp['ItemA'] > 0 - wp['bool2'] = wp['ItemB'] > 0 - wp['int1'] = 1 - wp['int2'] = 2 - return wp.consolidate() - p1 = _make_one_panel() - - 
self.store.append('p1_mixed', p1) - tm.assert_panel_equal(self.store.select('p1_mixed'), p1) + wp = tm.makePanel() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['ItemA'] > 0 + wp['bool2'] = wp['ItemB'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() + + self.store.append('p1_mixed', wp) + tm.assert_panel_equal(self.store.select('p1_mixed'), wp) # ndim - def _make_one_p4d(): - wp = tm.makePanel4D() - wp['obj1'] = 'foo' - wp['obj2'] = 'bar' - wp['bool1'] = wp['l1'] > 0 - wp['bool2'] = wp['l2'] > 0 - wp['int1'] = 1 - wp['int2'] = 2 - return wp.consolidate() - - p4d = _make_one_p4d() - self.store.append('p4d_mixed', p4d) - tm.assert_panel4d_equal(self.store.select('p4d_mixed'), p4d) + wp = tm.makePanel4D() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['l1'] > 0 + wp['bool2'] = wp['l2'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() + + self.store.append('p4d_mixed', wp) + tm.assert_panel4d_equal(self.store.select('p4d_mixed'), wp) + + def test_unimplemented_dtypes_table_columns(self): + #### currently not supported dtypes #### + for n,f in [ ('unicode',u'\u03c3'), ('date',datetime.date(2001,1,2)) ]: + df = tm.makeDataFrame() + df[n] = f + self.assertRaises(NotImplementedError, self.store.append, 'df1_%s' % n, df) + + # frame + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['datetime1'] = datetime.date(2001,1,2) + df = df.consolidate().convert_objects() + + # this fails because we have a date in the object block...... + self.assertRaises(Exception, self.store.append, 'df_unimplemented', df) def test_remove(self): ts = tm.makeTimeSeries() @@ -737,10 +985,10 @@ def test_terms(self): ('major_axis', '20121114'), ('major_axis', '>', '20121114'), (('major_axis', ['20121114','20121114']),), - ('major_axis', datetime(2012,11,14)), - 'major_axis>20121114', - 'major_axis>20121114', - 'major_axis>20121114', + ('major_axis', datetime.datetime(2012,11,14)), + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', (('minor_axis', ['A','B']),), (('minor_axis', ['A','B']),), ((('minor_axis', ['A','B']),),), @@ -844,14 +1092,13 @@ def test_index_types(self): ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) - ser = Series(values, [datetime.today(), 0]) + ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) - from datetime import date - ser = Series(values, [date.today(), 'a']) + ser = Series(values, [datetime.date.today(), 'a']) self._check_roundtrip(ser, func) ser = Series(values, [1.23, 'b']) @@ -863,7 +1110,7 @@ def test_index_types(self): ser = Series(values, [1, 5]) self._check_roundtrip(ser, func) - ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)]) + ser = Series(values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): @@ -1104,6 +1351,33 @@ def test_select(self): #self.assertRaises(Exception, self.store.select, # 'wp2', ('column', ['A', 'D'])) + # select with columns= + df = tm.makeTimeDataFrame() + self.store.remove('df') + self.store.append('df',df) + result = self.store.select('df', columns = ['A','B']) + expected = df.reindex(columns = ['A','B']) + tm.assert_frame_equal(expected, result) + + # equivalentsly + result = self.store.select('df', [ ('columns', ['A','B']) ]) + expected = df.reindex(columns = ['A','B']) + tm.assert_frame_equal(expected, result) + + # with a data column + 
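datetime64 values are now storable in table columns, but the frame should be run through ``convert_objects()`` first so the column really is ``datetime64[ns]``; ``datetime.date`` and unicode columns remain unsupported and raise. Sketch (column names are made up)::

    import datetime
    import pandas.util.testing as tm
    from pandas import HDFStore, Timestamp

    df = tm.makeDataFrame()
    df['stamp'] = Timestamp('20010102')
    df = df.convert_objects()          # ensure datetime64[ns], not object

    store = HDFStore('dtypes.h5')
    store.append('df_dt', df)

    bad = tm.makeDataFrame()
    bad['d'] = datetime.date(2001, 1, 2)
    # store.append('df_bad', bad)      # raises: date columns are not implemented
    store.close()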
self.store.remove('df') + self.store.append('df',df, data_columns = ['A']) + result = self.store.select('df', [ 'A > 0' ], columns = ['A','B']) + expected = df[df.A > 0].reindex(columns = ['A','B']) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + self.store.remove('df') + self.store.append('df',df, data_columns = ['A']) + result = self.store.select('df', [ 'A > 0' ], columns = ['C','D']) + expected = df[df.A > 0].reindex(columns = ['C','D']) + tm.assert_frame_equal(expected, result) + def test_panel_select(self): wp = tm.makePanel() self.store.put('wp', wp, table=True) @@ -1148,11 +1422,161 @@ def test_frame_select(self): self.store.append('df_float', df) self.store.select('df_float', [ Term("index<10.0"), Term("columns", "=", ["A"]) ]) + # invalid terms + df = tm.makeTimeDataFrame() + self.store.append('df_time', df) + self.assertRaises(Exception, self.store.select, 'df_time', [ Term("index>0") ]) + # can't select if not written as table #self.store['frame'] = df #self.assertRaises(Exception, self.store.select, # 'frame', [crit1, crit2]) + def test_unique(self): + df = tm.makeTimeDataFrame() + + + def check(x, y): + self.assert_((np.unique(x) == np.unique(y)).all() == True) + + self.store.remove('df') + self.store.append('df', df) + + # error + self.assertRaises(KeyError, self.store.unique, 'df','foo') + + # valid + result = self.store.unique('df','index') + check(result.values,df.index.values) + + # not a data indexable column + self.assertRaises(ValueError, self.store.unique, 'df','values_block_0') + + # a data column + df2 = df.copy() + df2['string'] = 'foo' + self.store.append('df2',df2,data_columns = ['string']) + result = self.store.unique('df2','string') + check(result.values,df2['string'].unique()) + + # a data column with NaNs, result excludes the NaNs + df3 = df.copy() + df3['string'] = 'foo' + df3.ix[4:6,'string'] = np.nan + self.store.append('df3',df3,data_columns = ['string']) + result = self.store.unique('df3','string') + check(result.values,df3['string'].valid().unique()) + + def test_coordinates(self): + df = tm.makeTimeDataFrame() + + self.store.remove('df') + self.store.append('df', df) + + # all + c = self.store.select_as_coordinates('df') + assert((c.values == np.arange(len(df.index))).all() == True) + + # get coordinates back & test vs frame + self.store.remove('df') + + df = DataFrame(dict(A = range(5), B = range(5))) + self.store.append('df', df) + c = self.store.select_as_coordinates('df',[ 'index<3' ]) + assert((c.values == np.arange(3)).all() == True) + result = self.store.select('df', where = c) + expected = df.ix[0:2,:] + tm.assert_frame_equal(result,expected) + + c = self.store.select_as_coordinates('df', [ 'index>=3', 'index<=4' ]) + assert((c.values == np.arange(2)+3).all() == True) + result = self.store.select('df', where = c) + expected = df.ix[3:4,:] + tm.assert_frame_equal(result,expected) + + # multiple tables + self.store.remove('df1') + self.store.remove('df2') + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) + self.store.append('df1',df1, data_columns = ['A','B']) + self.store.append('df2',df2) + + c = self.store.select_as_coordinates('df1', [ 'A>0','B>0' ]) + df1_result = self.store.select('df1',c) + df2_result = self.store.select('df2',c) + result = concat([ df1_result, df2_result ], axis=1) + + expected = concat([ df1, df2 ], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + def 
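``unique`` pulls the distinct values of an indexable or data column straight from the table; for a string data column the NaNs are dropped from the result. A sketch following ``test_unique``::

    import numpy as np
    import pandas.util.testing as tm
    from pandas import HDFStore

    df = tm.makeTimeDataFrame()
    df['string'] = 'foo'
    df.ix[4:6, 'string'] = np.nan

    store = HDFStore('unique.h5')
    store.append('df', df, data_columns=['string'])

    store.unique('df', 'index')    # unique index values
    store.unique('df', 'string')   # unique data-column values, NaN excluded
    store.close()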
test_append_to_multiple(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) + df2['foo'] = 'bar' + df = concat([ df1, df2 ], axis=1) + + # exceptions + self.assertRaises(Exception, self.store.append_to_multiple, { 'df1' : ['A','B'], 'df2' : None }, df, selector = 'df3') + self.assertRaises(Exception, self.store.append_to_multiple, { 'df1' : None, 'df2' : None }, df, selector = 'df3') + self.assertRaises(Exception, self.store.append_to_multiple, 'df1', df, 'df1') + + # regular operation + self.store.append_to_multiple({ 'df1' : ['A','B'], 'df2' : None }, df, selector = 'df1') + result = self.store.select_as_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + + def test_select_as_multiple(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) + df2['foo'] = 'bar' + self.store.append('df1',df1, data_columns = ['A','B']) + self.store.append('df2',df2) + + # exceptions + self.assertRaises(Exception, self.store.select_as_multiple, None, where = [ 'A>0','B>0' ], selector = 'df1') + self.assertRaises(Exception, self.store.select_as_multiple, [ None ], where = [ 'A>0','B>0' ], selector = 'df1') + + # default select + result = self.store.select('df1', ['A>0','B>0']) + expected = self.store.select_as_multiple([ 'df1' ], where = [ 'A>0','B>0' ], selector = 'df1') + tm.assert_frame_equal(result, expected) + expected = self.store.select_as_multiple( 'df1' , where = [ 'A>0','B>0' ], selector = 'df1') + tm.assert_frame_equal(result, expected) + + # multiple + result = self.store.select_as_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') + expected = concat([ df1, df2 ], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = self.store.select_as_multiple(['df1','df2'], where = [ Term('index', '>', df2.index[4]) ], selector = 'df2') + expected = concat([ df1, df2 ], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test excpection for diff rows + self.store.append('df3',tm.makeTimeDataFrame(nper=50)) + self.assertRaises(Exception, self.store.select_as_multiple, ['df1','df3'], where = [ 'A>0','B>0' ], selector = 'df1') + + def test_start_stop(self): + + df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20))) + self.store.append('df', df) + + result = self.store.select('df', [ Term("columns", "=", ["A"]) ], start=0, stop=5) + expected = df.ix[0:4,['A']] + tm.assert_frame_equal(result, expected) + + # out of range + result = self.store.select('df', [ Term("columns", "=", ["A"]) ], start=30, stop=40) + assert(len(result) == 0) + assert(type(result) == DataFrame) + def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) df.index = ['%.3d' % c for c in df.index] @@ -1230,14 +1654,25 @@ def test_legacy_table_read(self): # force the frame store.select('df2', typ = 'legacy_frame') - # old version (this still throws an exception though) + self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis','=','B')) + + # old version warning import warnings warnings.filterwarnings('ignore', category=IncompatibilityWarning) - self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis','=','B')) + df2 = store.select('df2') + store.select('df2', Term('index', '>', df2.index[2])) warnings.filterwarnings('always', 
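``append_to_multiple`` splits one wide frame across several tables keyed off a selector table, and ``select_as_multiple`` queries the selector and gathers the matching rows from all of them. Sketch mirroring these tests (keys and conditions are illustrative)::

    import pandas.util.testing as tm
    from pandas import HDFStore, concat

    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
    df = concat([df1, df2], axis=1)

    store = HDFStore('multi.h5')
    # 'df1' holds columns A and B; everything else goes to 'df2'
    store.append_to_multiple({'df1': ['A', 'B'], 'df2': None}, df, selector='df1')

    # query the selector table and pull the matching rows from both tables
    store.select_as_multiple(['df1', 'df2'], where=['A>0', 'B>0'], selector='df1')
    store.close()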
category=IncompatibilityWarning) store.close() + def test_legacy_0_10_read(self): + # legacy from 0.10 + pth = curpath() + store = HDFStore(os.path.join(pth, 'legacy_0.10.h5'), 'r') + for k in store.keys(): + store.select(k) + store.close() + def test_legacy_table_write(self): # legacy table types pth = curpath() @@ -1252,7 +1687,7 @@ def test_legacy_table_write(self): store.close() def test_store_datetime_fractional_secs(self): - dt = datetime(2012, 1, 2, 3, 4, 5, 123456) + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) series = Series([0], [dt]) self.store['a'] = series self.assertEquals(self.store['a'].index[0], dt) @@ -1307,13 +1742,13 @@ def test_store_datetime_mixed(self): df['d'] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal) - def test_cant_write_multiindex_table(self): - # for now, #1848 - df = DataFrame(np.random.randn(10, 4), - index=[np.arange(5).repeat(2), - np.tile(np.arange(2), 5)]) + #def test_cant_write_multiindex_table(self): + # # for now, #1848 + # df = DataFrame(np.random.randn(10, 4), + # index=[np.arange(5).repeat(2), + # np.tile(np.arange(2), 5)]) - self.assertRaises(Exception, self.store.put, 'foo', df, table=True) + # self.assertRaises(Exception, self.store.put, 'foo', df, table=True) def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index d904d86f183c3..39911c88e5686 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -15,7 +15,9 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyList_Check, PyFloat_Check, PyString_Check, PyTuple_SetItem, - PyTuple_New) + PyTuple_New, + PyObject_SetAttrString) + cimport cpython isnan = np.isnan @@ -740,33 +742,61 @@ def clean_index_list(list obj): return maybe_convert_objects(converted), 0 -from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, - PyDict_Contains, PyDict_Keys, - Py_INCREF, PyTuple_SET_ITEM, - PyTuple_SetItem, - PyTuple_New, - PyObject_SetAttrString) +@cython.boundscheck(False) +@cython.wraparound(False) +def max_len_string_array(ndarray[object, ndim=1] arr): + """ return the maximum size of elements in a 1-dim string array """ + cdef: + int i, m, l + length = arr.shape[0] + + m = 0 + for i from 0 <= i < length: + l = len(arr[i]) + + if l > m: + m = l + + return m @cython.boundscheck(False) @cython.wraparound(False) -def create_hdf_rows_2d(ndarray indexer0, ndarray[np.uint8_t, ndim=1] mask, - list values): +def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): + """ replace the values in the array with replacement if they are nan_rep; return the same array """ + + cdef int length = arr.shape[0] + cdef int i = 0 + if replace is None: + replace = np.nan + + for i from 0 <= i < length: + if arr[i] == nan_rep: + arr[i] = replace + + return arr + +@cython.boundscheck(False) +@cython.wraparound(False) +def create_hdf_rows_2d(ndarray indexer0, + ndarray[np.uint8_t, ndim=1] mask, + ndarray[np.uint8_t, ndim=1] searchable, + list values): """ return a list of objects ready to be converted to rec-array format """ cdef: int i, b, n_indexer0, n_blocks, tup_size - ndarray v list l - object tup, val + object tup, val, v n_indexer0 = indexer0.shape[0] n_blocks = len(values) tup_size = n_blocks+1 l = [] + for i from 0 <= i < n_indexer0: if not mask[i]: - + tup = PyTuple_New(tup_size) val = indexer0[i] PyTuple_SET_ITEM(tup, 0, val) @@ -774,7 +804,9 @@ def create_hdf_rows_2d(ndarray indexer0, ndarray[np.uint8_t, ndim=1] mask, for b from 0 <= b < n_blocks: - v = values[b][:, i] 
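``max_len_string_array`` above simply scans a 1-d object array for the widest string (used when sizing string atoms). A pure-Python equivalent of what the Cython loop computes, for reference only::

    import numpy as np

    def max_len_string_array(arr):
        # widest element in a 1-d object array of strings; 0 for an empty array
        m = 0
        for x in arr:
            if len(x) > m:
                m = len(x)
        return m

    max_len_string_array(np.array(['a', 'abc', 'ab'], dtype=object))   # -> 3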
+ v = values[b][:, i] + if searchable[b]: + v = v[0] PyTuple_SET_ITEM(tup, b+1, v) Py_INCREF(v) @@ -785,14 +817,15 @@ def create_hdf_rows_2d(ndarray indexer0, ndarray[np.uint8_t, ndim=1] mask, @cython.boundscheck(False) @cython.wraparound(False) def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, - ndarray[np.uint8_t, ndim=2] mask, list values): + ndarray[np.uint8_t, ndim=2] mask, + ndarray[np.uint8_t, ndim=1] searchable, + list values): """ return a list of objects ready to be converted to rec-array format """ cdef: int i, j, b, n_indexer0, n_indexer1, n_blocks, tup_size - ndarray v list l - object tup, val + object tup, val, v n_indexer0 = indexer0.shape[0] n_indexer1 = indexer1.shape[0] @@ -818,6 +851,8 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, for b from 0 <= b < n_blocks: v = values[b][:, i, j] + if searchable[b]: + v = v[0] PyTuple_SET_ITEM(tup, b+2, v) Py_INCREF(v) @@ -828,14 +863,15 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, @cython.boundscheck(False) @cython.wraparound(False) def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2, - ndarray[np.uint8_t, ndim=3] mask, list values): + ndarray[np.uint8_t, ndim=3] mask, + ndarray[np.uint8_t, ndim=1] searchable, + list values): """ return a list of objects ready to be converted to rec-array format """ cdef: int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size - ndarray v list l - object tup, val + object tup, val, v n_indexer0 = indexer0.shape[0] n_indexer1 = indexer1.shape[0] @@ -868,6 +904,8 @@ def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2, for b from 0 <= b < n_blocks: v = values[b][:, i, j, k] + if searchable[b]: + v = v[0] PyTuple_SET_ITEM(tup, b+3, v) Py_INCREF(v) diff --git a/vb_suite/hdfstore_bench.py b/vb_suite/hdfstore_bench.py index d43d8b60a9cf0..23303f335af7e 100644 --- a/vb_suite/hdfstore_bench.py +++ b/vb_suite/hdfstore_bench.py @@ -220,3 +220,33 @@ def remove(f): query_store_table = Benchmark("store.select('df12', [ ('index', '>', df.index[10000]), ('index', '<', df.index[15000]) ])", setup12, cleanup = "store.close()", start_date=start_date) +#---------------------------------------------------------------------- +# select from a panel table + +setup13 = common_setup + """ +p = Panel(randn(20, 1000, 1000), items= [ 'Item%03d' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in xrange(1000) ]) + +remove(f) +store = HDFStore(f) +store.append('p1',p) +""" + +read_store_table_panel = Benchmark("store.select('p1')", setup13, cleanup = "store.close()", + start_date=start_date) + + +#---------------------------------------------------------------------- +# write to a panel table + +setup14 = common_setup + """ +p = Panel(randn(20, 1000, 1000), items= [ 'Item%03d' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in xrange(1000) ]) + +remove(f) +store = HDFStore(f) +""" + +write_store_table_panel = Benchmark("store.append('p2',p)", setup14, cleanup = "store.close()", + start_date=start_date) +
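The ``searchable`` flag threaded through the row builders marks blocks that back a data-indexable column; for those, the scalar itself goes into the row tuple instead of a length-1 array. A rough, pure-Python picture of the per-row assembly (shapes and values are made up; the index column is omitted)::

    import numpy as np

    values = [np.random.randn(3, 5),                                # a 3-column float block
              np.array([['a', 'b', 'c', 'd', 'e']], dtype=object)]  # a single data column
    searchable = [False, True]

    def row(i):
        # mirrors what create_hdf_rows_2d builds for one row i
        return tuple(v[:, i][0] if s else v[:, i]
                     for v, s in zip(values, searchable))

    row(0)   # -> (array of 3 floats, 'a')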