From e317087b0e97c0c6637ac0aac70299cdb2f64336 Mon Sep 17 00:00:00 2001
From: Alex Gaudio
Date: Tue, 27 Aug 2013 19:48:33 -0400
Subject: [PATCH] BUG: HDFStore.append_to_multiple - ensures rows are
 synchronized before writing

adds dropna kwarg + docstring + tests + documentation + release note

python3 compatible
---
 doc/source/io.rst                | 30 +++++++++++++++++++-----------
 doc/source/release.rst           |  2 ++
 pandas/io/pytables.py            | 13 ++++++++++++-
 pandas/io/tests/test_pytables.py | 26 ++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 92042b6fe58c5..67cbe35144461 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2170,29 +2170,38 @@ multiple tables at once. The idea is to have one table (call it the
 selector table) that you index most/all of the columns, and perform your
 queries. The other table(s) are data tables with an index matching the
 selector table's index. You can then perform a very fast query
-on the selector table, yet get lots of data back. This method works similar to
-having a very wide table, but is more efficient in terms of queries.
+on the selector table, yet get lots of data back. This method is similar to
+having a very wide table, but enables more efficient queries.
 
-Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This
-means, append to the tables in the same order; ``append_to_multiple``
-splits a single object to multiple tables, given a specification (as a
-dictionary). This dictionary is a mapping of the table names to the
-'columns' you want included in that table. Pass a `None` for a single
-table (optional) to let it have the remaining columns. The argument
-``selector`` defines which table is the selector table.
+The ``append_to_multiple`` method splits a given single DataFrame
+into multiple tables according to ``d``, a dictionary that maps the
+table names to a list of 'columns' you want in that table. If `None`
+is used in place of a list, that table will have the remaining
+unspecified columns of the given DataFrame. The argument ``selector``
+defines which table is the selector table (which you can make queries from).
+The argument ``dropna`` will drop rows from the input DataFrame to ensure
+tables are synchronized. This means that if a row for one of the tables
+being written to is entirely ``np.NaN``, that row will be dropped from all tables.
+
+If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**.
+Remember that entirely ``np.NaN`` rows are not written to the HDFStore, so if
+you pass ``dropna=False``, some tables may have more rows than others,
+and therefore ``select_as_multiple`` may not work or it may return unexpected
+results.
 
 .. ipython:: python
 
    df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8),
                      columns=['A', 'B', 'C', 'D', 'E', 'F'])
    df_mt['foo'] = 'bar'
+   df_mt.ix[1, ('A', 'B')] = np.nan
 
    # you can also create the tables individually
    store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None },
                             df_mt, selector='df1_mt')
    store
 
-   # indiviual tables were created
+   # individual tables were created
    store.select('df1_mt')
    store.select('df2_mt')
 
@@ -2200,7 +2209,6 @@ table (optional) to let it have the remaining columns. The argument
 
    store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'],
                             selector = 'df1_mt')
 
-.. _io.hdf5-delete:
 Delete from a Table
 ~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/release.rst b/doc/source/release.rst
index a03b647b310b6..bbadba61c0135 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -95,6 +95,8 @@ pandas 0.13
 
   - ``HDFStore``
 
+    - ``append_to_multiple`` automatically synchronizes writing rows to multiple
+      tables and adds a ``dropna`` kwarg (:issue:`4698`)
     - handle a passed ``Series`` in table format (:issue:`4330`)
     - added an ``is_open`` property to indicate if the underlying file handle is_open; a closed store
      will now report 'CLOSED' when viewing the store (rather than raising an error)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 600f886c57c65..5ab63d016c3b8 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -786,7 +786,7 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None
         kwargs = self._validate_format(format, kwargs)
         self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)
 
-    def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
+    def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, dropna=True, **kwargs):
         """
         Append to multiple tables
 
@@ -798,6 +798,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
         selector : a string that designates the indexable table; all of its columns will
                    be designed as data_columns, unless data_columns is passed, in which
                    case these are used
+        data_columns : list of columns to create as data columns, or True to use all columns
+        dropna : if True, drop a row from all tables if that row's values in
+                 any single table are entirely NaN
 
         Notes
         -----
@@ -840,6 +843,14 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
         if data_columns is None:
             data_columns = d[selector]
 
+        # ensure rows are synchronized across the tables
+        if dropna:
+            idxs = (value[cols].dropna(how='all').index for cols in d.values())
+            valid_index = next(idxs)
+            for index in idxs:
+                valid_index = valid_index.intersection(index)
+            value = value.ix[valid_index]
+
         # append
         for k, v in d.items():
             dc = data_columns if k == selector else None
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 66f3d3766ee3e..6941452075f4b 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -2902,6 +2902,32 @@ def test_append_to_multiple(self):
             expected = df[(df.A > 0) & (df.B > 0)]
             tm.assert_frame_equal(result, expected)
 
+    def test_append_to_multiple_dropna(self):
+        df1 = tm.makeTimeDataFrame()
+        df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+        df1.ix[1, ['A', 'B']] = np.nan
+        df = concat([df1, df2], axis=1)
+
+        with ensure_clean(self.path) as store:
+            # dropna=True should guarantee rows are synchronized
+            store.append_to_multiple(
+                {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+                dropna=True)
+            result = store.select_as_multiple(['df1', 'df2'])
+            expected = df.dropna()
+            tm.assert_frame_equal(result, expected)
+            tm.assert_index_equal(store.select('df1').index,
+                                  store.select('df2').index)
+
+            # dropna=False shouldn't synchronize row indexes
+            store.append_to_multiple(
+                {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+                dropna=False)
+            self.assertRaises(
+                ValueError, store.select_as_multiple, ['df1', 'df2'])
+            assert not store.select('df1').index.equals(
+                store.select('df2').index)
+
     def test_select_as_multiple(self):
         df1 = tm.makeTimeDataFrame()
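
Below is a rough usage sketch of the new ``dropna`` keyword, illustrating the
behaviour described in the doc hunk above. The store path ``example.h5`` and
the table names ``df1``/``df2`` are placeholders mirroring the test case, not
part of the patch itself::

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.randn(8, 4), columns=list('ABCD'))
   df.ix[1, ['A', 'B']] = np.nan   # row 1 is entirely NaN for the 'df1' table

   store = pd.HDFStore('example.h5', mode='w')

   # dropna=True (the default) drops row 1 from *all* tables before writing,
   # so the tables stay row-synchronized and select_as_multiple works
   store.append_to_multiple({'df1': ['A', 'B'], 'df2': None},
                            df, selector='df1', dropna=True)
   result = store.select_as_multiple(['df1', 'df2'], selector='df1')
   assert len(result) == 7   # the all-NaN row was dropped everywhere

   # with dropna=False the all-NaN row would be silently skipped in 'df1'
   # only, leaving the tables with different lengths, and select_as_multiple
   # would then raise a ValueError
   store.close()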