Skip to content

Commit 020181d

Browse files
authored
Merge pull request #243 from djarecka/kaczmarj-fix/test_hash_value_dir
updates to hash value for directory (updates to #237)
2 parents e025ee0 + 0341f64 commit 020181d

File tree

2 files changed

+94
-25
lines changed

2 files changed

+94
-25
lines changed

pydra/engine/helpers_file.py

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,27 +87,66 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True):
8787
return crypto_obj.hexdigest()
8888

8989

90-
def hash_dir(
    dirpath,
    crypto=sha256,
    ignore_hidden_files=False,
    ignore_hidden_dirs=False,
    raise_notfound=True,
):
    """Compute hash of directory contents.

    This function computes the hash of every file in directory `dirpath`
    (using `hash_file`) and then hashes that ordered sequence of per-file
    hashes to return a single value. The directory is traversed recursively,
    with directory and file names sorted at every level so the traversal order
    is reproducible.

    Only the values returned by `hash_file` are fed into the final digest;
    file and directory names influence the result solely through the order in
    which files are visited.

    Parameters
    ----------
    dirpath : :obj:`str`
        Path to directory.
    crypto : :obj:`function`
        Cryptographic hash constructor (e.g. `hashlib.sha256`). Used both for
        the individual files and for the combined digest.
    ignore_hidden_files : :obj:`bool`
        If `True`, ignore filenames that begin with `.`.
    ignore_hidden_dirs : :obj:`bool`
        If `True`, do not descend into sub-directories whose name begins with
        `.`. The top-level `dirpath` itself is always traversed.
    raise_notfound : :obj:`bool`
        If `True` and `dirpath` does not exist, raise `FileNotFoundError`. If
        `False` and `dirpath` does not exist, return `None`.

    Returns
    -------
    hash : :obj:`str` or `None`
        Hash of the directory contents. `None` is returned when `dirpath` is
        `None`, a `LazyField` or a `list`, or when the directory is missing
        and `raise_notfound` is `False`.

    Raises
    ------
    FileNotFoundError
        If `dirpath` is not an existing directory and `raise_notfound` is
        `True`.
    """
    from .specs import LazyField

    if dirpath is None or isinstance(dirpath, (LazyField, list)):
        return None
    if not Path(dirpath).is_dir():
        if raise_notfound:
            raise FileNotFoundError(f"Directory {dirpath} not found.")
        return None

    crypto_obj = crypto()
    for dpath, dirnames, filenames in os.walk(dirpath):
        if ignore_hidden_dirs:
            # Prune in-place so os.walk never descends into hidden directories
            # (nor into anything nested below them). The root is never listed
            # in `dirnames`, so it is traversed whatever its own name is.
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
        # Sort in-place to guarantee a reproducible traversal order.
        dirnames.sort()
        filenames.sort()
        dpath = Path(dpath)
        for filename in filenames:
            if ignore_hidden_files and filename.startswith("."):
                continue
            # Use the same algorithm for the files as for the combined digest.
            this_hash = hash_file(dpath / filename, crypto=crypto)
            crypto_obj.update(this_hash.encode())

    return crypto_obj.hexdigest()
111150

112151

113152
def _parse_mount_table(exit_code, output):

pydra/engine/tests/test_helpers.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
2+
import hashlib
23
from pathlib import Path
4+
import random
35
import platform
46

57
import pytest
@@ -143,26 +145,54 @@ def test_hash_value_dir(tmpdir):
143145
with open(file_2, "w") as f:
144146
f.write("hi")
145147

146-
assert hash_value(tmpdir, tp=Directory) == hash_value([file_1, file_2], tp=File)
147-
assert hash_value(tmpdir, tp=Directory) == helpers_file.hash_dir(tmpdir)
148+
test_sha = hashlib.sha256()
149+
for fx in [file_1, file_2]:
150+
test_sha.update(helpers_file.hash_file(fx).encode())
151+
152+
bad_sha = hashlib.sha256()
153+
for fx in [file_2, file_1]:
154+
bad_sha.update(helpers_file.hash_file(fx).encode())
155+
156+
orig_hash = helpers_file.hash_dir(tmpdir)
157+
158+
assert orig_hash == test_sha.hexdigest()
159+
assert orig_hash != bad_sha.hexdigest()
160+
assert orig_hash == hash_value(tmpdir, tp=Directory)
148161

149162

150163
def test_hash_value_nested(tmpdir):
    """Check `hash_dir` on a nested tree containing hidden files and dirs.

    Layout created under `tmpdir`::

        file_1.txt
        .hidden/file_2.txt
        nested/.file_3.txt
        nested/file_4.txt
    """
    hidden = tmpdir.mkdir(".hidden")
    nested = tmpdir.mkdir("nested")
    file_1 = tmpdir.join("file_1.txt")
    file_2 = hidden.join("file_2.txt")
    file_3 = nested.join(".file_3.txt")
    file_4 = nested.join("file_4.txt")

    # Build the expected digest by hand, feeding the per-file hashes in the
    # order the sorted traversal is expected to visit them.
    expected = hashlib.sha256()
    for path in (file_1, file_2, file_3, file_4):
        with open(path, "w") as handle:
            handle.write(str(random.randint(0, 1000)))
        expected.update(helpers_file.hash_file(path).encode())

    orig_hash = helpers_file.hash_dir(tmpdir)

    assert orig_hash == expected.hexdigest()
    assert orig_hash == hash_value(tmpdir, tp=Directory)

    # Each way of ignoring hidden entries must change the digest.
    nohidden_hash = helpers_file.hash_dir(
        tmpdir, ignore_hidden_dirs=True, ignore_hidden_files=True
    )
    nohiddendirs_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_dirs=True)
    nohiddenfiles_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_files=True)

    for filtered_hash in (nohidden_hash, nohiddendirs_hash, nohiddenfiles_hash):
        assert orig_hash != filtered_hash

    # Physically deleting the hidden entries must give the same digests as
    # ignoring them did.
    file_3.remove()
    assert helpers_file.hash_dir(tmpdir) == nohiddenfiles_hash
    hidden.remove()
    assert helpers_file.hash_dir(tmpdir) == nohidden_hash
166196

167197

168198
def test_get_available_cpus():

0 commit comments

Comments
 (0)