diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
index 627c79f7289b7..25c7e97923082 100644
--- a/doc/source/whatsnew/v0.16.2.txt
+++ b/doc/source/whatsnew/v0.16.2.txt
@@ -92,6 +92,6 @@ Bug Fixes
 - Bug in ``SparseSeries`` constructor ignores input data name (:issue:`10258`)
 
 - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`)
-
+- Bug in ``DataFrame.to_hdf()`` where table format would raise a seemingly unrelated error for invalid (non-string) column names. This is now explicitly forbidden. (:issue:`9057`)
 - Bug to handle masking empty ``DataFrame``(:issue:`10126`)
 - Bug where MySQL interface could not handle numeric table/column names (:issue:`10255`)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 4cbc7aeaa3df7..8948592358636 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -257,6 +257,7 @@ def _tables():
 def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
            append=None, **kwargs):
     """ store this object, close it if we opened it """
+
     if append:
         f = lambda store: store.append(key, value, **kwargs)
     else:
@@ -1535,6 +1536,15 @@ def maybe_set_size(self, min_itemsize=None, **kwargs):
                 self.typ = _tables(
                 ).StringCol(itemsize=min_itemsize, pos=self.pos)
 
+    def validate(self, handler, append, **kwargs):
+        """ validate this column before writing; only the column names
+        are checked here, handler and append are not used """
+        self.validate_names()
+
+    def validate_names(self):
+        """ no-op hook; subclasses may override it to reject invalid names """
+        pass
+
     def validate_and_set(self, handler, append, **kwargs):
         self.set_table(handler.table)
         self.validate_col()
@@ -2080,6 +2090,11 @@ class DataIndexableCol(DataCol):
     """ represent a data column that can be indexed """
     is_data_indexable = True
 
+    def validate_names(self):
+        """ raise ValueError if the labels are not of object dtype """
+        if not Index(self.values).is_object():
+            raise ValueError("cannot have non-object label DataIndexableCol")
+
     def get_atom_string(self, block, itemsize):
         return _tables().StringCol(itemsize=itemsize)
 
@@ -3756,6 +3771,10 @@ def write(self, obj, axes=None, append=False, complib=None,
                          min_itemsize=min_itemsize,
                          **kwargs)
 
+        # validate each axis before the table is created or written to
+        for a in self.axes:
+            a.validate(self, append)
+
         if not self.is_exists:
 
             # create the table
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 7d9c3c051344f..f671e61e90084 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -4640,6 +4640,39 @@ def test_colums_multiindex_modified(self):
             df_loaded = read_hdf(path, 'df', columns=cols2load)
             self.assertTrue(cols2load_original == cols2load)
 
+    def test_to_hdf_with_object_column_names(self):
+        # GH9057
+        # Writing HDF5 table format should only work for string-like
+        # column types
+
+        types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
+                             tm.makeDateIndex, tm.makeTimedeltaIndex,
+                             tm.makePeriodIndex]
+        types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]
+
+        if compat.PY3:
+            types_should_run.append(tm.makeUnicodeIndex)
+        else:
+            types_should_fail.append(tm.makeUnicodeIndex)
+
+        for index in types_should_fail:
+            df = DataFrame(np.random.randn(10, 2), columns=index(2))
+            with ensure_clean_path(self.path) as path:
+                # assertRaises(..., msg=...) does not check the exception
+                # text, so match the message explicitly
+                with tm.assertRaisesRegexp(
+                        ValueError,
+                        "cannot have non-object label DataIndexableCol"):
+                    df.to_hdf(path, 'df', format='table', data_columns=True)
+
+        for index in types_should_run:
+            df = DataFrame(np.random.randn(10, 2), columns=index(2))
+            with ensure_clean_path(self.path) as path:
+                df.to_hdf(path, 'df', format='table', data_columns=True)
+                result = read_hdf(
+                    path, 'df', where="index = [{0}]".format(df.index[0]))
+                self.assertTrue(len(result))
+
 
 def _test_sort(obj):
     if isinstance(obj, DataFrame):