Skip to content

Commit f8e817a

Browse files
committed
merging
2 parents e025ee0 + aacdba9 commit f8e817a

File tree

2 files changed

+88
-25
lines changed

2 files changed

+88
-25
lines changed

pydra/engine/helpers_file.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,27 +87,60 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True):
8787
return crypto_obj.hexdigest()
8888

8989

90-
def hash_dir(dirpath, raise_notfound=True):
90+
def hash_dir(
    dirpath, ignore_hidden_files=False, ignore_hidden_dirs=False, raise_notfound=True
):
    """Compute hash of directory contents.

    This function computes the hash of every file in directory `dirpath` and then
    computes the hash of that list of hashes to return a single hash value. The
    directory is traversed recursively, visiting sub-directories and files in
    sorted order so that the result is reproducible.

    Parameters
    ----------
    dirpath : :obj:`str`
        Path to directory.
    ignore_hidden_files : :obj:`bool`
        If `True`, ignore filenames that begin with `.`.
    ignore_hidden_dirs : :obj:`bool`
        If `True`, ignore everything below sub-directories whose name begins
        with `.` (including their non-hidden sub-directories). The top-level
        `dirpath` itself is always traversed, whatever its name.
    raise_notfound : :obj:`bool`
        If `True` and `dirpath` does not exist, raise `FileNotFoundError`. If
        `False` and `dirpath` does not exist, return `None`.

    Returns
    -------
    hash : :obj:`str`
        Hash of the directory contents, or `None` if `dirpath` is `None`, a
        `LazyField`, a list, or (with `raise_notfound=False`) not a directory.

    Raises
    ------
    FileNotFoundError
        If `dirpath` is not an existing directory and `raise_notfound` is `True`.
    """
    from .specs import LazyField

    if dirpath is None or isinstance(dirpath, LazyField) or isinstance(dirpath, list):
        return None
    if not Path(dirpath).is_dir():
        if raise_notfound:
            raise FileNotFoundError(f"Directory {dirpath} not found.")
        return None

    sha = sha256()
    for dpath, dirnames, filenames in os.walk(dirpath):
        if ignore_hidden_dirs:
            # Prune in place: with top-down traversal os.walk then never descends
            # into hidden directories, so their whole subtree is excluded.
            dirnames[:] = [name for name in dirnames if not name.startswith(".")]
        # Sort in-place to guarantee a deterministic traversal order.
        dirnames.sort()
        filenames.sort()
        dpath = Path(dpath)
        for filename in filenames:
            if ignore_hidden_files and filename.startswith("."):
                continue
            sha.update(hash_file(dpath / filename).encode())

    return sha.hexdigest()
111144

112145

113146
def _parse_mount_table(exit_code, output):

pydra/engine/tests/test_helpers.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
2+
import hashlib
23
from pathlib import Path
4+
import random
35
import platform
46

57
import pytest
@@ -143,26 +145,54 @@ def test_hash_value_dir(tmpdir):
143145
with open(file_2, "w") as f:
144146
f.write("hi")
145147

146-
assert hash_value(tmpdir, tp=Directory) == hash_value([file_1, file_2], tp=File)
147-
assert hash_value(tmpdir, tp=Directory) == helpers_file.hash_dir(tmpdir)
148+
test_sha = hashlib.sha256()
149+
for fx in [file_1, file_2]:
150+
test_sha.update(helpers_file.hash_file(fx).encode())
151+
152+
bad_sha = hashlib.sha256()
153+
for fx in [file_2, file_1]:
154+
bad_sha.update(helpers_file.hash_file(fx).encode())
155+
156+
orig_hash = helpers_file.hash_dir(tmpdir)
157+
158+
assert orig_hash == test_sha.hexdigest()
159+
assert orig_hash != bad_sha.hexdigest()
160+
assert orig_hash == hash_value(tmpdir, tp=Directory)
148161

149162

150163
def test_hash_value_nested(tmpdir):
    """Check `hash_dir` on a nested tree containing a hidden dir and a hidden file."""
    hidden = tmpdir.mkdir(".hidden")
    nested = tmpdir.mkdir("nested")
    file_1 = tmpdir.join("file_1.txt")
    file_2 = hidden.join("file_2.txt")
    file_3 = nested.join(".file_3.txt")
    file_4 = nested.join("file_4.txt")

    # Files are listed in the order hash_dir visits them: top-level files first,
    # then sub-directories in sorted order (".hidden" sorts before "nested").
    # Contents are deterministic and distinct so every file has its own hash
    # and the test is reproducible from run to run.
    test_sha = hashlib.sha256()
    for idx, fx in enumerate([file_1, file_2, file_3, file_4]):
        with open(fx, "w") as f:
            f.write(f"content {idx}")
        test_sha.update(helpers_file.hash_file(fx).encode())

    orig_hash = helpers_file.hash_dir(tmpdir)

    assert orig_hash == test_sha.hexdigest()
    assert orig_hash == hash_value(tmpdir, tp=Directory)

    nohidden_hash = helpers_file.hash_dir(
        tmpdir, ignore_hidden_dirs=True, ignore_hidden_files=True
    )
    nohiddendirs_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_dirs=True)
    nohiddenfiles_hash = helpers_file.hash_dir(tmpdir, ignore_hidden_files=True)

    assert orig_hash != nohidden_hash
    assert orig_hash != nohiddendirs_hash
    assert orig_hash != nohiddenfiles_hash
    # The two flags exclude different files, so they must not agree either.
    assert nohiddendirs_hash != nohiddenfiles_hash

    # Physically removing the hidden entries must match the "ignore" results.
    file_3.remove()
    assert helpers_file.hash_dir(tmpdir) == nohiddenfiles_hash
    hidden.remove()
    assert helpers_file.hash_dir(tmpdir) == nohidden_hash
166196

167197

168198
def test_get_available_cpus():

0 commit comments

Comments
 (0)