From 9e938dd7cff3d8c0cadb38243fc44f94b738c9ea Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 08:20:44 -0400 Subject: [PATCH 1/7] TEST: Constrain gzip outputs to have deterministic checksums by default --- nibabel/tests/test_openers.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/nibabel/tests/test_openers.py b/nibabel/tests/test_openers.py index ae25785bb4..8d6781aed5 100644 --- a/nibabel/tests/test_openers.py +++ b/nibabel/tests/test_openers.py @@ -12,6 +12,8 @@ from gzip import GzipFile from io import BytesIO, UnsupportedOperation from distutils.version import StrictVersion +import hashlib +import time from numpy.compat.py3k import asstr, asbytes from ..openers import Opener, ImageOpener, HAVE_INDEXED_GZIP, BZ2File @@ -347,3 +349,34 @@ def test_iter(): lobj = Opener(Lunk('')) with pytest.raises(TypeError): list(lobj) + + +def md5sum(fname): + with open(fname, "rb") as fobj: + return hashlib.md5(fobj.read()).hexdigest() + + +def test_bitwise_determinism(): + with InTemporaryDirectory(): + msg = b"Hello, I'd like to have an argument." + # Canonical reference: No filename, no mtime + # Use default compresslevel + with open("ref.gz", "wb") as fobj: + with GzipFile(filename="", mode="wb", + compresslevel=1, fileobj=fobj, + mtime=0) as gzobj: + gzobj.write(msg) + anon_chksum = md5sum("ref.gz") + + # Different times, different filenames + now = time.time() + with mock.patch("time.time") as t: + t.return_value = now + with Opener("a.gz", "wb") as fobj: + fobj.write(msg) + t.return_value = now + 1 + with Opener("b.gz", "wb") as fobj: + fobj.write(msg) + + assert md5sum("a.gz") == anon_chksum + assert md5sum("b.gz") == anon_chksum From 9df85a4029ee0c0c1f2e276dcf6a72c5f84b7572 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 08:22:24 -0400 Subject: [PATCH 2/7] ENH: Add DeterministicGzipFile wrapper class --- nibabel/openers.py | 15 +++++++++ nibabel/tests/test_openers.py | 62 ++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/nibabel/openers.py b/nibabel/openers.py index 612346fcfb..cb378e341e 100644 --- a/nibabel/openers.py +++ b/nibabel/openers.py @@ -40,6 +40,21 @@ HAVE_INDEXED_GZIP = False +class DeterministicGzipFile(gzip.GzipFile): + """ Deterministic variant of GzipFile + + This writer does not add filename information to the header, and defaults + to a modification time (``mtime``) of 0 seconds. + """ + def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=0): + if mode and 'b' not in mode: + mode += 'b' + if filename: + fileobj = self.myfileobj = open(filename, mode or 'rb') + return super().__init__(filename="", mode=mode, compresslevel=compresslevel, + fileobj=fileobj, mtime=mtime) + + def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False): # use indexed_gzip if possible for faster read access. If keep_open == diff --git a/nibabel/tests/test_openers.py b/nibabel/tests/test_openers.py index 8d6781aed5..4e89578d68 100644 --- a/nibabel/tests/test_openers.py +++ b/nibabel/tests/test_openers.py @@ -16,7 +16,7 @@ import time from numpy.compat.py3k import asstr, asbytes -from ..openers import Opener, ImageOpener, HAVE_INDEXED_GZIP, BZ2File +from ..openers import Opener, ImageOpener, HAVE_INDEXED_GZIP, BZ2File, DeterministicGzipFile from ..tmpdirs import InTemporaryDirectory from ..volumeutils import BinOpener @@ -356,6 +356,66 @@ def md5sum(fname): return hashlib.md5(fobj.read()).hexdigest() +def test_DeterministicGzipFile(): + with InTemporaryDirectory(): + msg = b"Hello, I'd like to have an argument." + + # No filename, no mtime + with open("ref.gz", "wb") as fobj: + with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj: + gzobj.write(msg) + anon_chksum = md5sum("ref.gz") + + with DeterministicGzipFile("default.gz", "wb") as fobj: + internal_fobj = fobj.myfileobj + fobj.write(msg) + # Check that myfileobj is being closed by GzipFile.close() + # This is in case GzipFile changes its internal implementation + assert internal_fobj.closed + + assert md5sum("default.gz") == anon_chksum + + # No filename, current mtime + now = time.time() + with open("ref.gz", "wb") as fobj: + with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=now) as gzobj: + gzobj.write(msg) + now_chksum = md5sum("ref.gz") + + with DeterministicGzipFile("now.gz", "wb", mtime=now) as fobj: + fobj.write(msg) + + assert md5sum("now.gz") == now_chksum + + # Change in default behavior + with mock.patch("time.time") as t: + t.return_value = now + + # GzipFile will use time.time() + with open("ref.gz", "wb") as fobj: + with GzipFile(filename="", mode="wb", fileobj=fobj) as gzobj: + gzobj.write(msg) + assert md5sum("ref.gz") == now_chksum + + # DeterministicGzipFile will use 0 + with DeterministicGzipFile("now.gz", "wb") as fobj: + fobj.write(msg) + assert md5sum("now.gz") == anon_chksum + + # GzipFile is filename dependent, DeterministicGzipFile is independent + with GzipFile("filenameA.gz", mode="wb", mtime=0) as gzobj: + gzobj.write(msg) + fnameA_chksum = md5sum("filenameA.gz") + assert fnameA_chksum != anon_chksum + + with DeterministicGzipFile("filenameA.gz", "wb") as fobj: + fobj.write(msg) + + # But the contents are the same with different filenames + assert md5sum("filenameA.gz") == anon_chksum + + + def test_bitwise_determinism(): with InTemporaryDirectory(): msg = b"Hello, I'd like to have an argument." From a27fd89734eef1b3de762806d0b91dff950a0272 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 08:24:00 -0400 Subject: [PATCH 3/7] ENH: Open files with DeterministicGzipFile, expose mtime to Opener --- nibabel/openers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nibabel/openers.py b/nibabel/openers.py index cb378e341e..ed9a3152d4 100644 --- a/nibabel/openers.py +++ b/nibabel/openers.py @@ -55,7 +55,7 @@ def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtim fileobj=fileobj, mtime=mtime) -def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False): +def _gzip_open(filename, mode='rb', compresslevel=9, mtime=0, keep_open=False): # use indexed_gzip if possible for faster read access. If keep_open == # True, we tell IndexedGzipFile to keep the file handle open. Otherwise @@ -65,7 +65,7 @@ def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False): # Fall-back to built-in GzipFile else: - gzip_file = gzip.GzipFile(filename, mode, compresslevel) + gzip_file = DeterministicGzipFile(filename, mode, compresslevel, mtime=mtime) return gzip_file @@ -90,7 +90,7 @@ class Opener(object): passed to opening method when `fileish` is str. Change of defaults as for \*args """ - gz_def = (_gzip_open, ('mode', 'compresslevel', 'keep_open')) + gz_def = (_gzip_open, ('mode', 'compresslevel', 'mtime', 'keep_open')) bz2_def = (BZ2File, ('mode', 'buffering', 'compresslevel')) compress_ext_map = { '.gz': gz_def, From f4e3ae7eeeb4e4060a78964e2c8d7833577b5e86 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 08:29:51 -0400 Subject: [PATCH 4/7] TEST: Check that mtime pass-through works --- nibabel/tests/test_openers.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nibabel/tests/test_openers.py b/nibabel/tests/test_openers.py index 4e89578d68..c7f0b4adab 100644 --- a/nibabel/tests/test_openers.py +++ b/nibabel/tests/test_openers.py @@ -440,3 +440,12 @@ def test_bitwise_determinism(): assert md5sum("a.gz") == anon_chksum assert md5sum("b.gz") == anon_chksum + + # Users can still set mtime, but filenames will not be embedded + with Opener("filenameA.gz", "wb", mtime=0xCAFE10C0) as fobj: + fobj.write(msg) + with Opener("filenameB.gz", "wb", mtime=0xCAFE10C0) as fobj: + fobj.write(msg) + fnameA_chksum = md5sum("filenameA.gz") + fnameB_chksum = md5sum("filenameB.gz") + assert fnameA_chksum == fnameB_chksum != anon_chksum From 26f6f0a6691bd5dc14ed7fa4464aa81f546185de Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 08:30:33 -0400 Subject: [PATCH 5/7] FIX: Filename is no longer stored with the file objects, use our own copy --- nibabel/openers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nibabel/openers.py b/nibabel/openers.py index ed9a3152d4..e6329f61f1 100644 --- a/nibabel/openers.py +++ b/nibabel/openers.py @@ -158,10 +158,7 @@ def name(self): self._name will be None if object was created with a fileobj, otherwise it will be the filename. """ - try: - return self.fobj.name - except AttributeError: - return self._name + return self._name @property def mode(self): From 2fb5bf3792b8252fad472410399040ad0d604067 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 09:08:38 -0400 Subject: [PATCH 6/7] TEST: Check cases when fileobj is passed --- nibabel/tests/test_openers.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/nibabel/tests/test_openers.py b/nibabel/tests/test_openers.py index c7f0b4adab..d7e5a0bd7b 100644 --- a/nibabel/tests/test_openers.py +++ b/nibabel/tests/test_openers.py @@ -415,6 +415,29 @@ def test_DeterministicGzipFile(): assert md5sum("filenameA.gz") == anon_chksum +def test_DeterministicGzipFile_fileobj(): + with InTemporaryDirectory(): + msg = b"Hello, I'd like to have an argument." + with open("ref.gz", "wb") as fobj: + with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj: + gzobj.write(msg) + ref_chksum = md5sum("ref.gz") + + with open("test.gz", "wb") as fobj: + with DeterministicGzipFile(filename="", mode="wb", fileobj=fobj) as gzobj: + gzobj.write(msg) + md5sum("test.gz") == ref_chksum + + with open("test.gz", "wb") as fobj: + with DeterministicGzipFile(fileobj=fobj, mode="wb") as gzobj: + gzobj.write(msg) + md5sum("test.gz") == ref_chksum + + with open("test.gz", "wb") as fobj: + with DeterministicGzipFile(filename="test.gz", mode="wb", fileobj=fobj) as gzobj: + gzobj.write(msg) + md5sum("test.gz") == ref_chksum + def test_bitwise_determinism(): with InTemporaryDirectory(): From b4b2c23236e6120de6f159892fbf09ff926901bf Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Fri, 4 Jun 2021 09:09:44 -0400 Subject: [PATCH 7/7] FIX: Do not re-create fileobj when passed to DGF --- nibabel/openers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nibabel/openers.py b/nibabel/openers.py index e6329f61f1..2d1af7579f 100644 --- a/nibabel/openers.py +++ b/nibabel/openers.py @@ -47,9 +47,11 @@ class DeterministicGzipFile(gzip.GzipFile): to a modification time (``mtime``) of 0 seconds. """ def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=0): + # These two guards are copied from + # https://github.com/python/cpython/blob/6ab65c6/Lib/gzip.py#L171-L174 if mode and 'b' not in mode: mode += 'b' - if filename: + if fileobj is None: fileobj = self.myfileobj = open(filename, mode or 'rb') return super().__init__(filename="", mode=mode, compresslevel=compresslevel, fileobj=fileobj, mtime=mtime)