diff --git a/.gitignore b/.gitignore index 6b00558fb3b19..4dd19a4946e26 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ .idea .vagrant .noseids +.ipynb_checkpoints # Compiled source # ################### diff --git a/doc/source/io.rst b/doc/source/io.rst index 65f887288cc6d..6d30e864c4f1c 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2410,6 +2410,10 @@ for some advanced strategies There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version. +.. warning:: + + As of version 0.17.0, ``HDFStore`` will not drop rows that have all missing values by default. Previously, if all values (except the index) were missing, ``HDFStore`` would not write those rows to disk. + .. ipython:: python :suppress: :okexcept: @@ -2486,6 +2490,8 @@ Closing a Store, Context Manager import os os.remove('store.h5') + + Read/Write API ~~~~~~~~~~~~~~ @@ -2504,6 +2510,65 @@ similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0) os.remove('store_tl.h5') + +As of version 0.17.0, HDFStore will no longer drop rows that are all missing by default. The previous behavior (dropping such rows) can be restored by setting ``dropna=True``. + +.. ipython:: python + :suppress: + + import os + +.. ipython:: python + + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], + 'col2':[1, np.nan, np.nan]}) + df_with_missing + + df_with_missing.to_hdf('file.h5', 'df_with_missing', + format = 'table', mode='w') + + pd.read_hdf('file.h5', 'df_with_missing') + + df_with_missing.to_hdf('file.h5', 'df_with_missing', + format = 'table', mode='w', dropna=True) + pd.read_hdf('file.h5', 'df_with_missing') + + +.. ipython:: python + :suppress: + + os.remove('file.h5') + +This is also true for the major axis of a ``Panel``: + +.. 
ipython:: python + + matrix = [[[np.nan, np.nan, np.nan],[1,np.nan,np.nan]], + [[np.nan, np.nan, np.nan], [np.nan,5,6]], + [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]] + + panel_with_major_axis_all_missing = Panel(matrix, + items=['Item1', 'Item2','Item3'], + major_axis=[1,2], + minor_axis=['A', 'B', 'C']) + + panel_with_major_axis_all_missing + + panel_with_major_axis_all_missing.to_hdf('file.h5', 'panel', + dropna = True, + format='table', + mode='w') + reloaded = read_hdf('file.h5', 'panel') + reloaded + + +.. ipython:: python + :suppress: + + os.remove('file.h5') + + + .. _io.hdf5-fixed: Fixed Format diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 83e5ec5b1d107..80c9d16b1dcf7 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -295,6 +295,9 @@ Usually you simply want to know which values are null. None == None np.nan == np.nan + +.. _whatsnew_0170.api_breaking.other: + Other API Changes ^^^^^^^^^^^^^^^^^ @@ -330,6 +333,52 @@ Other API Changes ``raise ValueError`` All other public methods (names not beginning with underscores) =============================== =============================================================== + +- default behavior for HDFStore write functions with ``format='table'`` is now to keep rows that are all missing except for index. Previously, the behavior was to drop rows that were all missing save the index. The previous behavior can be replicated using the ``dropna=True`` option. (:issue:`9382`) + +Previously, + +.. ipython:: python + + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], + 'col2':[1, np.nan, np.nan]}) + + df_with_missing + + +.. code-block:: python + + In [28]: + df_with_missing.to_hdf('file.h5', 'df_with_missing', format='table', mode='w') + + pd.read_hdf('file.h5', 'df_with_missing') + + Out [28]: + col1 col2 + 0 0 1 + 2 2 NaN + + +New behavior: + +.. ipython:: python + :suppress: + + import os + +.. 
ipython:: python + + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 'table', mode='w') + + pd.read_hdf('file.h5', 'df_with_missing') + +.. ipython:: python + :suppress: + + os.remove('file.h5') + +See :ref:`documentation <io.hdf5>` for more details. + .. _whatsnew_0170.deprecations: Deprecations diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 292871000cafb..0d4b83d15ad3b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -922,6 +922,8 @@ def to_hdf(self, path_or_buf, key, **kwargs): in the store wherever possible fletcher32 : bool, default False If applying compression use the fletcher32 checksum + dropna : boolean, default False + If True, rows that are entirely NaN will not be written to the store. """ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9e1a272ec5621..2c9ffe6b74536 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -220,7 +220,7 @@ class DuplicateWarning(Warning): """ with config.config_prefix('io.hdf'): - config.register_option('dropna_table', True, dropna_doc, + config.register_option('dropna_table', False, dropna_doc, validator=config.is_bool) config.register_option( 'default_format', None, format_doc, @@ -817,7 +817,7 @@ def put(self, key, value, format=None, append=False, **kwargs): This will force Table format, append the input data to the existing. 
encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to + dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' """ if format is None: @@ -899,7 +899,7 @@ def append(self, key, value, format=None, append=True, columns=None, chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to + dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' Notes ----- @@ -919,7 +919,7 @@ def append(self, key, value, format=None, append=True, columns=None, **kwargs) def append_to_multiple(self, d, value, selector, data_columns=None, - axes=None, dropna=True, **kwargs): + axes=None, dropna=False, **kwargs): """ Append to multiple tables @@ -934,7 +934,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, data_columns : list of columns to create as data columns, or True to use all columns dropna : if evaluates to True, drop rows from all tables if any single - row in each table has all NaN + row in each table has all NaN. Default False. 
Notes ----- @@ -3787,7 +3787,7 @@ class AppendableTable(LegacyTable): def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, - chunksize=None, expectedrows=None, dropna=True, **kwargs): + chunksize=None, expectedrows=None, dropna=False, **kwargs): if not append and self.is_exists: self._handle.remove_node(self.group, 'table') @@ -3827,7 +3827,7 @@ def write(self, obj, axes=None, append=False, complib=None, # add the rows self.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize, dropna=True): + def write_data(self, chunksize, dropna=False): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ea30fb14251f4..210852d83094f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1040,6 +1040,28 @@ def test_append_all_nans(self): store.append('df2', df[10:], dropna=False) tm.assert_frame_equal(store['df2'], df) + # Test to make sure defaults are to not drop. 
+ # Corresponding to Issue 9382 + df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]}) + + with ensure_clean_path(self.path) as path: + df_with_missing.to_hdf(path, 'df_with_missing', format = 'table') + reloaded = read_hdf(path, 'df_with_missing') + tm.assert_frame_equal(df_with_missing, reloaded) + + matrix = [[[np.nan, np.nan, np.nan],[1,np.nan,np.nan]], + [[np.nan, np.nan, np.nan], [np.nan,5,6]], + [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]] + + panel_with_missing = Panel(matrix, items=['Item1', 'Item2','Item3'], + major_axis=[1,2], + minor_axis=['A', 'B', 'C']) + + with ensure_clean_path(self.path) as path: + panel_with_missing.to_hdf(path, 'panel_with_missing', format='table') + reloaded_panel = read_hdf(path, 'panel_with_missing') + tm.assert_panel_equal(panel_with_missing, reloaded_panel) + def test_append_frame_column_oriented(self): with ensure_clean_store(self.path) as store: @@ -4885,7 +4907,6 @@ def test_complex_append(self): result = store.select('df') assert_frame_equal(pd.concat([df, df], 0), result) - def _test_sort(obj): if isinstance(obj, DataFrame): return obj.reindex(sorted(obj.index))