diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 3c3742c968642..012a35d9d2f6e 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -29,6 +29,46 @@ API changes +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- The default behavior for HDF write functions with "table" format is now to keep rows that are all missing except for the index. Previously such rows were dropped; pass ``dropna=True`` (or set the ``io.hdf.dropna_table`` option) to restore that behavior. (:issue:`9382`) + + Previously, + + .. code-block:: python + In [1]: + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') + + df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) + df_without_missing.to_hdf('file.h5', 'df_without_missing') + + pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf('file.h5', 'df_without_missing') + + Out [1]: + col1 col2 + 0 0 1 + 2 2 3 + col1 col2 + 0 0 1 + 1 -1 -1 + 2 2 3 + + + + New behavior: + + .. ipython:: python + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') + + df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) + df_without_missing.to_hdf('file.h5', 'df_without_missing') + + pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf('file.h5', 'df_without_missing') + diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4dc777b71dd45..ac1a43656addf 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -219,7 +219,7 @@ class DuplicateWarning(Warning): """ with config.config_prefix('io.hdf'): - config.register_option('dropna_table', True, dropna_doc, + config.register_option('dropna_table', False, dropna_doc, validator=config.is_bool) config.register_option( 'default_format', None, format_doc, @@ -801,8 +801,8 @@ def put(self, key, value, format=None, append=False, **kwargs): This will force Table format, append the input data to the existing. 
 encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + dropna : boolean, default False. If True, do not write an ALL nan + row to the store. Settable by the option 'io.hdf.dropna_table' """ if format is None: format = get_option("io.hdf.default_format") or 'fixed' @@ -883,8 +883,8 @@ def append(self, key, value, format=None, append=True, columns=None, chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + dropna : boolean, default False. If True, do not write an ALL nan + row to the store. Settable by the option 'io.hdf.dropna_table' Notes ----- Does *not* check if data being appended overlaps with existing @@ -903,7 +903,7 @@ def append(self, key, value, format=None, append=True, columns=None, **kwargs) def append_to_multiple(self, d, value, selector, data_columns=None, - axes=None, dropna=True, **kwargs): + axes=None, dropna=False, **kwargs): """ Append to multiple tables @@ -918,7 +918,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, data_columns : list of columns to create as data columns, or True to use all columns dropna : if evaluates to True, drop rows from all tables if any single - row in each table has all NaN + row in each table has all NaN. Default False. 
Notes ----- @@ -3741,7 +3741,7 @@ class AppendableTable(LegacyTable): def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, - chunksize=None, expectedrows=None, dropna=True, **kwargs): + chunksize=None, expectedrows=None, dropna=False, **kwargs): if not append and self.is_exists: self._handle.remove_node(self.group, 'table') @@ -3778,7 +3778,7 @@ def write(self, obj, axes=None, append=False, complib=None, # add the rows self.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize, dropna=True): + def write_data(self, chunksize, dropna=False): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index a15149e341f4d..e568a24b0efed 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4588,6 +4588,14 @@ def test_duplicate_column_name(self): other = read_hdf(path, 'df') tm.assert_frame_equal(df, other) + def test_all_missing_values(self): + # Test corresponding to Issue 9382 + df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]}) + + with ensure_clean_path(self.path) as path: + df_with_missing.to_hdf(path, 'df_with_missing', format = 't') + reloaded = read_hdf(path, 'df_with_missing') + tm.assert_frame_equal(df_with_missing, reloaded) def _test_sort(obj): if isinstance(obj, DataFrame):