diff --git a/doc/source/io.rst b/doc/source/io.rst index 90bb762f1a1ba..9fd2ea1eb83ec 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2745,6 +2745,9 @@ Notes & Caveats need to serialize these operations in a single thread in a single process. You will corrupt your data otherwise. See the issue (:`2397`) for more information. + - If you use locks to manage write access between multiple processes, you + may want to use :py:func:`~os.fsync` before releasing write locks. For + convenience you can use ``store.flush(fsync=True)`` to do this for you. - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *columns* or *minor_axis*) are determined as the maximum size diff --git a/doc/source/release.rst b/doc/source/release.rst index 49de8dddd7210..eaf1b601b2d3c 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -275,6 +275,8 @@ API Changes - store `datetime.date` objects as ordinals rather then timetuples to avoid timezone issues (:issue:`2852`), thanks @tavistmorph and @numpand - ``numexpr`` 2.2.2 fixes incompatiblity in PyTables 2.4 (:issue:`4908`) + - ``flush`` now accepts an ``fsync`` parameter, which defaults to ``False`` + (:issue:`5364`) - ``JSON`` - added ``date_unit`` parameter to specify resolution of timestamps. diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 05528d5c0d407..5919589978903 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -10,6 +10,7 @@ import copy import itertools import warnings +import os import numpy as np from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index, @@ -525,12 +526,26 @@ def is_open(self): return False return bool(self._handle.isopen) - def flush(self): + def flush(self, fsync=False): """ - Force all buffered modifications to be written to disk + Force all buffered modifications to be written to disk. 
+ + Parameters + ---------- + fsync : bool, default False + If True, also call ``os.fsync()`` on the file's descriptor. + + Notes + ----- + Without ``fsync=True``, flushing may not guarantee that the OS writes + to disk. With ``fsync=True``, the call blocks until the OS reports the + file as written; however, caching layers below the OS (e.g. a drive's + write cache) may still delay the physical write. """ if self._handle is not None: self._handle.flush() + if fsync: + os.fsync(self._handle.fileno()) def get(self, key): """ @@ -4072,5 +4087,4 @@ def timeit(key, df, fn=None, remove=True, **kwargs): store.close() if remove: - import os os.remove(fn) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index fe60352845316..a08073bd7bd35 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -465,6 +465,7 @@ def test_flush(self): with ensure_clean(self.path) as store: store['a'] = tm.makeTimeSeries() store.flush() + store.flush(fsync=True) def test_get(self):