Skip to content

Commit 4c68051

Browse files
authored
Merge pull request #16 from casework/AC-211
AC-211
2 parents 83fd6ed + 42000c9 commit 4c68051

File tree

16 files changed

+171
-88
lines changed

16 files changed

+171
-88
lines changed

case_utils/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,13 +13,17 @@
1313

1414
__version__ = "0.2.1"
1515

16+
import typing
1617
import warnings
1718

18-
import rdflib.util
19+
import rdflib.util # type: ignore
1920

2021
from . import local_uuid
2122

22-
def guess_format(fpath, fmap=None):
23+
def guess_format(
24+
fpath : str,
25+
fmap : typing.Optional[typing.Dict[str, str]] = None
26+
) -> typing.Optional[str]:
2327
warnings.warn("The functionality in case_utils.guess_format is now upstream. Please revise your code to use rdflib.util.guess_format. The function arguments remain the same. case_utils.guess_format will be removed in case_utils 0.4.0.", DeprecationWarning)
2428

25-
return rdflib.util.guess_format(fpath, fmap)
29+
return rdflib.util.guess_format(fpath, fmap) # type: ignore

case_utils/case_file/__init__.py

Lines changed: 44 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,10 @@
2020
import datetime
2121
import hashlib
2222
import os
23+
import typing
24+
import warnings
2325

24-
import rdflib
26+
import rdflib # type: ignore
2527

2628
import case_utils
2729

@@ -34,7 +36,24 @@
3436
NS_UCO_VOCABULARY = rdflib.Namespace("https://unifiedcyberontology.org/ontology/uco/vocabulary#")
3537
NS_XSD = rdflib.XSD
3638

37-
def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX, disable_hashes=False, disable_mtime=False):
39+
# Shortcut syntax for defining an immutable named tuple is noted here:
40+
# https://docs.python.org/3/library/typing.html#typing.NamedTuple
41+
# via the "See also" box here: https://docs.python.org/3/library/collections.html#collections.namedtuple
42+
class HashDict(typing.NamedTuple):
43+
filesize : int
44+
md5 : str
45+
sha1 : str
46+
sha256 : str
47+
sha512 : str
48+
49+
def create_file_node(
50+
graph : rdflib.Graph,
51+
filepath : str,
52+
node_iri : typing.Optional[str] = None,
53+
node_prefix : str = DEFAULT_PREFIX,
54+
disable_hashes : bool = False,
55+
disable_mtime : bool = False
56+
) -> rdflib.URIRef:
3857
r"""
3958
This function characterizes the file at filepath.
4059
@@ -119,20 +138,20 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
119138
))
120139

121140
# Compute hashes until they are re-computed and match once. (This is a lesson learned from working with a NAS that had a subtly faulty network cable.)
122-
successful_hashdict = None
123-
last_hashdict = dict()
141+
142+
successful_hashdict : typing.Optional[HashDict] = None
143+
last_hashdict : typing.Optional[HashDict] = None
124144
for attempt_no in [0, 1, 2, 3]:
125-
current_hashdict = dict()
126145
# Hash file's contents.
127146
# This hashing logic was partially copied from DFXML's walk_to_dfxml.py.
128147
md5obj = hashlib.md5()
129148
sha1obj = hashlib.sha1()
130149
sha256obj = hashlib.sha256()
131150
sha512obj = hashlib.sha512()
132151
stashed_error = None
152+
byte_tally = 0
133153
with open(filepath, "rb") as in_fh:
134154
chunk_size = 2**22
135-
byte_tally = 0
136155
while True:
137156
buf = b""
138157
try:
@@ -147,13 +166,15 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
147166
sha1obj.update(buf)
148167
sha256obj.update(buf)
149168
sha512obj.update(buf)
150-
current_hashdict["filesize"] = byte_tally
151169
if not stashed_error is None:
152170
raise stashed_error
153-
current_hashdict["md5"] = md5obj.hexdigest()
154-
current_hashdict["sha1"] = sha1obj.hexdigest()
155-
current_hashdict["sha256"] = sha256obj.hexdigest()
156-
current_hashdict["sha512"] = sha512obj.hexdigest()
171+
current_hashdict = HashDict(
172+
byte_tally,
173+
md5obj.hexdigest(),
174+
sha1obj.hexdigest(),
175+
sha256obj.hexdigest(),
176+
sha512obj.hexdigest()
177+
)
157178
if last_hashdict == current_hashdict:
158179
successful_hashdict = current_hashdict
159180
break
@@ -163,22 +184,23 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
163184
del current_hashdict
164185
if successful_hashdict is None:
165186
raise ValueError("Failed to confirm hashes of file %r." % filepath)
166-
if successful_hashdict["filesize"] != file_stat.st_size:
187+
if successful_hashdict.filesize != file_stat.st_size:
167188
# TODO - Discuss with AC whether this should be something stronger, like an assertion error.
168-
_logger.warning(
169-
"Inode file size and hashed file sizes disagree: %d vs. %d.",
170-
file_stat.st_size,
171-
successful_hashdict["filesize"]
189+
warnings.warn(
190+
"Inode file size and hashed file sizes disagree: %d vs. %d." % (
191+
file_stat.st_size,
192+
successful_hashdict.filesize
193+
)
172194
)
173195
# TODO - Discuss whether this property should be recorded even if hashes are not attempted.
174196
graph.add((
175197
n_contentdata_facet,
176198
NS_UCO_OBSERVABLE.sizeInBytes,
177-
rdflib.Literal(successful_hashdict["filesize"])
199+
rdflib.Literal(successful_hashdict.filesize)
178200
))
179201

180202
# Add confirmed hashes into graph.
181-
for key in successful_hashdict:
203+
for key in successful_hashdict._fields:
182204
if not key in ("md5", "sha1", "sha256", "sha512"):
183205
continue
184206
n_hash = rdflib.BNode()
@@ -197,15 +219,16 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
197219
NS_UCO_TYPES.hashMethod,
198220
rdflib.Literal(key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab)
199221
))
222+
hash_value = getattr(successful_hashdict, key)
200223
graph.add((
201224
n_hash,
202225
NS_UCO_TYPES.hashValue,
203-
rdflib.Literal(successful_hashdict[key].upper(), datatype=NS_XSD.hexBinary)
226+
rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary)
204227
))
205228

206229
return n_file
207230

208-
def main():
231+
def main() -> None:
209232
import argparse
210233
parser = argparse.ArgumentParser()
211234
parser.add_argument("--base-prefix", default=DEFAULT_PREFIX)
@@ -234,7 +257,7 @@ def main():
234257
else:
235258
output_format = args.output_format
236259

237-
serialize_kwargs = {
260+
serialize_kwargs : typing.Dict[str, typing.Any] = {
238261
"format": output_format
239262
}
240263
if output_format == "json-ld":

case_utils/case_sparql_construct/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,14 +20,15 @@
2020
import argparse
2121
import os
2222
import logging
23+
import typing
2324

24-
import rdflib.plugins.sparql
25+
import rdflib.plugins.sparql # type: ignore
2526

2627
import case_utils
2728

2829
_logger = logging.getLogger(os.path.basename(__file__))
2930

30-
def main():
31+
def main() -> None:
3132
parser = argparse.ArgumentParser()
3233
parser.add_argument("-d", "--debug", action="store_true")
3334
parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
@@ -74,7 +75,7 @@ def main():
7475
else:
7576
output_format = args.output_format
7677

77-
serialize_kwargs = {
78+
serialize_kwargs : typing.Dict[str, typing.Any] = {
7879
"format": output_format
7980
}
8081
if output_format == "json-ld":

case_utils/case_sparql_select/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,16 +33,16 @@
3333
import os
3434
import logging
3535

36-
import pandas as pd
37-
import rdflib.plugins.sparql
36+
import pandas as pd # type: ignore
37+
import rdflib.plugins.sparql # type: ignore
3838

3939
import case_utils
4040

4141
NS_XSD = rdflib.XSD
4242

4343
_logger = logging.getLogger(os.path.basename(__file__))
4444

45-
def main():
45+
def main() -> None:
4646
parser = argparse.ArgumentParser()
4747
parser.add_argument("-d", "--debug", action="store_true")
4848
parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")

case_utils/local_uuid.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,17 +21,17 @@
2121
import sys
2222
import uuid
2323

24-
USE_DEMO_UUID = False
24+
USE_DEMO_UUID : bool = False
2525

26-
DEMO_UUID_COUNTER = 0
26+
DEMO_UUID_COUNTER : int = 0
2727

28-
def configure():
28+
def configure() -> None:
2929
global USE_DEMO_UUID
3030

3131
if os.getenv("DEMO_UUID_REQUESTING_NONRANDOM") == "NONRANDOM_REQUESTED":
3232
USE_DEMO_UUID = True
3333

34-
def demo_uuid():
34+
def demo_uuid() -> str:
3535
"""
3636
This function generates a repeatable UUID, drawing on non-varying elements of the environment and process call for entropy.
3737
@@ -52,16 +52,17 @@ def demo_uuid():
5252
parts.append(str(DEMO_UUID_COUNTER))
5353

5454
# Component: Present working directory, replacing $HOME with '~'.
55-
parts.append(os.getcwd().replace(os.getenv("HOME"), "~"))
55+
env_HOME : str = os.getenv("HOME", "/nonexistent")
56+
parts.append(os.getcwd().replace(env_HOME, "~"))
5657

5758
# Component: Argument vector.
5859
parts.extend(sys.argv)
5960

6061
return str(uuid.uuid5(uuid.NAMESPACE_URL, "/".join(parts)))
6162

62-
def local_uuid():
63+
def local_uuid() -> str:
6364
"""
64-
Generate either a UUID4, or if requested via environment configuration, a non-random demo UUID. Returns a string.
65+
Generate either a UUID4, or if requested via environment configuration, a non-random demo UUID.
6566
"""
6667
global USE_DEMO_UUID
6768
if USE_DEMO_UUID:

case_utils/py.typed

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,13 @@
1+
# This software was developed at the National Institute of Standards
2+
# and Technology by employees of the Federal Government in the course
3+
# of their official duties. Pursuant to title 17 Section 105 of the
4+
# United States Code this software is not subject to copyright
5+
# protection and is in the public domain. NIST assumes no
6+
# responsibility whatsoever for its use by other parties, and makes
7+
# no guarantees, expressed or implied, about its quality,
8+
# reliability, or any other characteristic.
9+
#
10+
# We would appreciate acknowledgement if the software is used.
11+
12+
# This file is defined to support PEP 561:
13+
# https://www.python.org/dev/peps/pep-0561/

setup.cfg

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ classifiers =
1515
Programming Language :: Python :: 3
1616

1717
[options]
18+
include_package_data = true
1819
# TODO The constraint on pyparsing can be removed when rdflib Issue #1190 is resolved.
1920
# https://github.com/RDFLib/rdflib/issues/1190
2021
install_requires =
@@ -32,3 +33,6 @@ console_scripts =
3233
case_sparql_construct = case_utils.case_sparql_construct:main
3334
# Note that numpy (pandas dependency, and pandas is dependency of case_sparql_select) is only supported in Python >= 3.7.
3435
case_sparql_select = case_utils.case_sparql_select:main
36+
37+
[options.package_data]
38+
case_utils = py.typed

tests/Makefile

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ all: \
3232
check-case_sparql_construct \
3333
check-case_sparql_select \
3434
check-isomorphic_diff \
35+
check-mypy \
3536
download
3637

3738
.venv.done.log: \
@@ -79,6 +80,7 @@ all-case_sparql_select: \
7980
# These check calls are provided in preferred run-order.
8081
check: \
8182
check-isomorphic_diff \
83+
check-mypy \
8284
check-case_file \
8385
check-case_sparql_construct \
8486
check-case_sparql_select
@@ -116,6 +118,18 @@ check-isomorphic_diff: \
116118
--directory isomorphic_diff \
117119
check
118120

121+
# mypy is called against specific members of the tests directory to avoid descending into the virtual environment.
122+
check-mypy: \
123+
.venv.done.log
124+
source venv/bin/activate \
125+
&& mypy \
126+
$(top_srcdir)/case_utils \
127+
case_file \
128+
case_sparql_construct \
129+
case_utils \
130+
hexbinary \
131+
src
132+
119133
clean:
120134
@$(MAKE) \
121135
--directory case_sparql_select \

tests/case_file/test_case_file.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
import os
1717

1818
import pytest
19-
import rdflib.plugins.sparql
19+
import rdflib.plugins.sparql # type: ignore
2020

2121
_logger = logging.getLogger(os.path.basename(__file__))
2222

@@ -37,20 +37,24 @@
3737

3838
SRCDIR = os.path.dirname(__file__)
3939

40-
def load_graph(filename):
40+
def load_graph(
41+
filename : str
42+
) -> rdflib.Graph:
4143
in_graph = rdflib.Graph()
4244
in_graph.parse(filename)
4345
return in_graph
4446

4547
@pytest.fixture
46-
def graph_case_file():
48+
def graph_case_file() -> rdflib.Graph:
4749
return load_graph(os.path.join(SRCDIR, "sample.txt.ttl"))
4850

4951
@pytest.fixture
50-
def graph_case_file_disable_hashes():
52+
def graph_case_file_disable_hashes() -> rdflib.Graph:
5153
return load_graph(os.path.join(SRCDIR, "sample.txt-disable_hashes.ttl"))
5254

53-
def test_confirm_hashes(graph_case_file):
55+
def test_confirm_hashes(
56+
graph_case_file : rdflib.Graph
57+
) -> None:
5458
expected = {
5559
"MD5": "098F6BCD4621D373CADE4E832627B4F6",
5660
"SHA1": "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3",
@@ -91,7 +95,10 @@ def test_confirm_hashes(graph_case_file):
9195

9296
assert expected == computed
9397

94-
def test_confirm_mtime(graph_case_file, graph_case_file_disable_hashes):
98+
def test_confirm_mtime(
99+
graph_case_file : rdflib.Graph,
100+
graph_case_file_disable_hashes : rdflib.Graph
101+
) -> None:
95102
query_confirm_mtime = """
96103
SELECT ?nFile
97104
WHERE {

0 commit comments

Comments
 (0)