36
36
NS_UCO_VOCABULARY = rdflib .Namespace ("https://unifiedcyberontology.org/ontology/uco/vocabulary#" )
37
37
NS_XSD = rdflib .XSD
38
38
39
- def create_file_node (graph , filepath , node_iri = None , node_prefix = DEFAULT_PREFIX , disable_hashes = False , disable_mtime = False ):
39
+ # Shortcut syntax for defining an immutable named tuple is noted here:
40
+ # https://docs.python.org/3/library/typing.html#typing.NamedTuple
41
+ # via the "See also" box here: https://docs.python.org/3/library/collections.html#collections.namedtuple
42
class HashDict(typing.NamedTuple):
    """
    Immutable record of one hashing pass over a file's contents.

    Instances compare by value, field by field, like any tuple.  The
    caller uses that to check whether two consecutive hashing passes
    produced the same size and digests.

    The field names are available, in declaration order, from
    ``HashDict._fields``.
    """

    # Number of bytes tallied while reading the file for hashing.
    filesize: int
    # Hexadecimal digest strings, one per hash algorithm.
    md5: str
    sha1: str
    sha256: str
    sha512: str
48
+
49
+ def create_file_node (
50
+ graph : rdflib .Graph ,
51
+ filepath : str ,
52
+ node_iri : typing .Optional [str ] = None ,
53
+ node_prefix : str = DEFAULT_PREFIX ,
54
+ disable_hashes : bool = False ,
55
+ disable_mtime : bool = False
56
+ ) -> rdflib .URIRef :
40
57
r"""
41
58
This function characterizes the file at filepath.
42
59
@@ -121,20 +138,20 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
121
138
))
122
139
123
140
# Compute hashes until they are re-computed and match once. (This is a lesson learned from working with a NAS that had a subtly faulty network cable.)
124
- successful_hashdict = None
125
- last_hashdict = dict ()
141
+
142
+ successful_hashdict : typing .Optional [HashDict ] = None
143
+ last_hashdict : typing .Optional [HashDict ] = None
126
144
for attempt_no in [0 , 1 , 2 , 3 ]:
127
- current_hashdict = dict ()
128
145
# Hash file's contents.
129
146
# This hashing logic was partially copied from DFXML's walk_to_dfxml.py.
130
147
md5obj = hashlib .md5 ()
131
148
sha1obj = hashlib .sha1 ()
132
149
sha256obj = hashlib .sha256 ()
133
150
sha512obj = hashlib .sha512 ()
134
151
stashed_error = None
152
+ byte_tally = 0
135
153
with open (filepath , "rb" ) as in_fh :
136
154
chunk_size = 2 ** 22
137
- byte_tally = 0
138
155
while True :
139
156
buf = b""
140
157
try :
@@ -149,13 +166,15 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
149
166
sha1obj .update (buf )
150
167
sha256obj .update (buf )
151
168
sha512obj .update (buf )
152
- current_hashdict ["filesize" ] = byte_tally
153
169
if not stashed_error is None :
154
170
raise stashed_error
155
- current_hashdict ["md5" ] = md5obj .hexdigest ()
156
- current_hashdict ["sha1" ] = sha1obj .hexdigest ()
157
- current_hashdict ["sha256" ] = sha256obj .hexdigest ()
158
- current_hashdict ["sha512" ] = sha512obj .hexdigest ()
171
+ current_hashdict = HashDict (
172
+ byte_tally ,
173
+ md5obj .hexdigest (),
174
+ sha1obj .hexdigest (),
175
+ sha256obj .hexdigest (),
176
+ sha512obj .hexdigest ()
177
+ )
159
178
if last_hashdict == current_hashdict :
160
179
successful_hashdict = current_hashdict
161
180
break
@@ -165,22 +184,23 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
165
184
del current_hashdict
166
185
if successful_hashdict is None :
167
186
raise ValueError ("Failed to confirm hashes of file %r." % filepath )
168
- if successful_hashdict [ " filesize" ] != file_stat .st_size :
187
+ if successful_hashdict . filesize != file_stat .st_size :
169
188
# TODO - Discuss with AC whether this should be something stronger, like an assertion error.
170
189
warnings .warn (
171
- "Inode file size and hashed file sizes disagree: %d vs. %d." ,
172
- file_stat .st_size ,
173
- successful_hashdict ["filesize" ]
190
+ "Inode file size and hashed file sizes disagree: %d vs. %d." % (
191
+ file_stat .st_size ,
192
+ successful_hashdict .filesize
193
+ )
174
194
)
175
195
# TODO - Discuss whether this property should be recorded even if hashes are not attempted.
176
196
graph .add ((
177
197
n_contentdata_facet ,
178
198
NS_UCO_OBSERVABLE .sizeInBytes ,
179
- rdflib .Literal (successful_hashdict [ " filesize" ] )
199
+ rdflib .Literal (successful_hashdict . filesize )
180
200
))
181
201
182
202
# Add confirmed hashes into graph.
183
- for key in successful_hashdict :
203
+ for key in successful_hashdict . _fields :
184
204
if not key in ("md5" , "sha1" , "sha256" , "sha512" ):
185
205
continue
186
206
n_hash = rdflib .BNode ()
@@ -199,10 +219,11 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
199
219
NS_UCO_TYPES .hashMethod ,
200
220
rdflib .Literal (key .upper (), datatype = NS_UCO_VOCABULARY .HashNameVocab )
201
221
))
222
+ hash_value = getattr (successful_hashdict , key )
202
223
graph .add ((
203
224
n_hash ,
204
225
NS_UCO_TYPES .hashValue ,
205
- rdflib .Literal (successful_hashdict [ key ] .upper (), datatype = NS_XSD .hexBinary )
226
+ rdflib .Literal (hash_value .upper (), datatype = NS_XSD .hexBinary )
206
227
))
207
228
208
229
return n_file
0 commit comments