Skip to content

AC-211 #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Oct 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions case_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@

__version__ = "0.2.1"

import typing
import warnings

import rdflib.util
import rdflib.util # type: ignore

from . import local_uuid

def guess_format(
    fpath: str,
    fmap: typing.Optional[typing.Dict[str, str]] = None
) -> typing.Optional[str]:
    """
    Deprecated pass-through to rdflib.util.guess_format.

    :param fpath: File path whose serialization format should be guessed.
    :param fmap: Optional extension-to-format mapping, forwarded unchanged.
    :returns: The format string rdflib guesses, or None if no guess is made.

    Emits a DeprecationWarning on every call; scheduled for removal in
    case_utils 0.4.0.
    """
    warnings.warn("The functionality in case_utils.guess_format is now upstream. Please revise your code to use rdflib.util.guess_format. The function arguments remain the same. case_utils.guess_format will be removed in case_utils 0.4.0.", DeprecationWarning)

    return rdflib.util.guess_format(fpath, fmap)  # type: ignore
65 changes: 44 additions & 21 deletions case_utils/case_file/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
import datetime
import hashlib
import os
import typing
import warnings

import rdflib
import rdflib # type: ignore

import case_utils

Expand All @@ -34,7 +36,24 @@
NS_UCO_VOCABULARY = rdflib.Namespace("https://unifiedcyberontology.org/ontology/uco/vocabulary#")
NS_XSD = rdflib.XSD

def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX, disable_hashes=False, disable_mtime=False):
# Shortcut syntax for defining an immutable named tuple is noted here:
# https://docs.python.org/3/library/typing.html#typing.NamedTuple
# via the "See also" box here: https://docs.python.org/3/library/collections.html#collections.namedtuple
class HashDict(typing.NamedTuple):
    """
    Immutable record pairing a file's byte count with its content hashes
    (hex digest strings), used to confirm two hashing passes agree.
    """

    # Number of bytes read while hashing the file.
    filesize: int
    # Hex digests as produced by hashlib's .hexdigest() (lowercase).
    md5: str
    sha1: str
    sha256: str
    sha512: str

def create_file_node(
graph : rdflib.Graph,
filepath : str,
node_iri : typing.Optional[str] = None,
node_prefix : str = DEFAULT_PREFIX,
disable_hashes : bool = False,
disable_mtime : bool = False
) -> rdflib.URIRef:
r"""
This function characterizes the file at filepath.

Expand Down Expand Up @@ -119,20 +138,20 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
))

# Compute hashes until they are re-computed and match once. (This is a lesson learned from working with a NAS that had a subtly faulty network cable.)
successful_hashdict = None
last_hashdict = dict()

successful_hashdict : typing.Optional[HashDict] = None
last_hashdict : typing.Optional[HashDict] = None
for attempt_no in [0, 1, 2, 3]:
current_hashdict = dict()
# Hash file's contents.
# This hashing logic was partially copied from DFXML's walk_to_dfxml.py.
md5obj = hashlib.md5()
sha1obj = hashlib.sha1()
sha256obj = hashlib.sha256()
sha512obj = hashlib.sha512()
stashed_error = None
byte_tally = 0
with open(filepath, "rb") as in_fh:
chunk_size = 2**22
byte_tally = 0
while True:
buf = b""
try:
Expand All @@ -147,13 +166,15 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
sha1obj.update(buf)
sha256obj.update(buf)
sha512obj.update(buf)
current_hashdict["filesize"] = byte_tally
if not stashed_error is None:
raise stashed_error
current_hashdict["md5"] = md5obj.hexdigest()
current_hashdict["sha1"] = sha1obj.hexdigest()
current_hashdict["sha256"] = sha256obj.hexdigest()
current_hashdict["sha512"] = sha512obj.hexdigest()
current_hashdict = HashDict(
byte_tally,
md5obj.hexdigest(),
sha1obj.hexdigest(),
sha256obj.hexdigest(),
sha512obj.hexdigest()
)
if last_hashdict == current_hashdict:
successful_hashdict = current_hashdict
break
Expand All @@ -163,22 +184,23 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
del current_hashdict
if successful_hashdict is None:
raise ValueError("Failed to confirm hashes of file %r." % filepath)
if successful_hashdict["filesize"] != file_stat.st_size:
if successful_hashdict.filesize != file_stat.st_size:
# TODO - Discuss with AC whether this should be something stronger, like an assertion error.
_logger.warning(
"Inode file size and hashed file sizes disagree: %d vs. %d.",
file_stat.st_size,
successful_hashdict["filesize"]
warnings.warn(
"Inode file size and hashed file sizes disagree: %d vs. %d." % (
file_stat.st_size,
successful_hashdict.filesize
)
)
# TODO - Discuss whether this property should be recorded even if hashes are not attempted.
graph.add((
n_contentdata_facet,
NS_UCO_OBSERVABLE.sizeInBytes,
rdflib.Literal(successful_hashdict["filesize"])
rdflib.Literal(successful_hashdict.filesize)
))

# Add confirmed hashes into graph.
for key in successful_hashdict:
for key in successful_hashdict._fields:
if not key in ("md5", "sha1", "sha256", "sha512"):
continue
n_hash = rdflib.BNode()
Expand All @@ -197,15 +219,16 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
NS_UCO_TYPES.hashMethod,
rdflib.Literal(key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab)
))
hash_value = getattr(successful_hashdict, key)
graph.add((
n_hash,
NS_UCO_TYPES.hashValue,
rdflib.Literal(successful_hashdict[key].upper(), datatype=NS_XSD.hexBinary)
rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary)
))

return n_file

def main():
def main() -> None:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--base-prefix", default=DEFAULT_PREFIX)
Expand Down Expand Up @@ -234,7 +257,7 @@ def main():
else:
output_format = args.output_format

serialize_kwargs = {
serialize_kwargs : typing.Dict[str, typing.Any] = {
"format": output_format
}
if output_format == "json-ld":
Expand Down
7 changes: 4 additions & 3 deletions case_utils/case_sparql_construct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
import argparse
import os
import logging
import typing

import rdflib.plugins.sparql
import rdflib.plugins.sparql # type: ignore

import case_utils

_logger = logging.getLogger(os.path.basename(__file__))

def main():
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
Expand Down Expand Up @@ -74,7 +75,7 @@ def main():
else:
output_format = args.output_format

serialize_kwargs = {
serialize_kwargs : typing.Dict[str, typing.Any] = {
"format": output_format
}
if output_format == "json-ld":
Expand Down
6 changes: 3 additions & 3 deletions case_utils/case_sparql_select/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@
import os
import logging

import pandas as pd
import rdflib.plugins.sparql
import pandas as pd # type: ignore
import rdflib.plugins.sparql # type: ignore

import case_utils

NS_XSD = rdflib.XSD

_logger = logging.getLogger(os.path.basename(__file__))

def main():
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
Expand Down
15 changes: 8 additions & 7 deletions case_utils/local_uuid.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@
import sys
import uuid

# When True, local_uuid() yields deterministic demo UUIDs instead of UUID4.
# Enabled only via configure() reading DEMO_UUID_REQUESTING_NONRANDOM.
USE_DEMO_UUID: bool = False

# Counter appended to the demo UUID's seed material (see demo_uuid);
# presumably incremented per call — increment site not visible here.
DEMO_UUID_COUNTER: int = 0

def configure() -> None:
    """
    Opt in to demo (non-random) UUID generation from the environment.

    Sets the module-global USE_DEMO_UUID flag to True when the environment
    variable DEMO_UUID_REQUESTING_NONRANDOM equals the exact string
    "NONRANDOM_REQUESTED"; otherwise the flag is left untouched.
    """
    global USE_DEMO_UUID

    if os.getenv("DEMO_UUID_REQUESTING_NONRANDOM") == "NONRANDOM_REQUESTED":
        USE_DEMO_UUID = True

def demo_uuid():
def demo_uuid() -> str:
"""
This function generates a repeatable UUID, drawing on non-varying elements of the environment and process call for entropy.

Expand All @@ -52,16 +52,17 @@ def demo_uuid():
parts.append(str(DEMO_UUID_COUNTER))

# Component: Present working directory, replacing $HOME with '~'.
parts.append(os.getcwd().replace(os.getenv("HOME"), "~"))
env_HOME : str = os.getenv("HOME", "/nonexistent")
parts.append(os.getcwd().replace(env_HOME, "~"))

# Component: Argument vector.
parts.extend(sys.argv)

return str(uuid.uuid5(uuid.NAMESPACE_URL, "/".join(parts)))

def local_uuid():
def local_uuid() -> str:
"""
Generate either a UUID4, or if requested via environment configuration, a non-random demo UUID. Returns a string.
Generate either a UUID4, or if requested via environment configuration, a non-random demo UUID.
"""
global USE_DEMO_UUID
if USE_DEMO_UUID:
Expand Down
13 changes: 13 additions & 0 deletions case_utils/py.typed
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.

# This file is defined to support PEP 561:
# https://www.python.org/dev/peps/pep-0561/
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ classifiers =
Programming Language :: Python :: 3

[options]
include_package_data = true
# TODO The constraint on pyparsing can be removed when rdflib Issue #1190 is resolved.
# https://github.com/RDFLib/rdflib/issues/1190
install_requires =
Expand All @@ -32,3 +33,6 @@ console_scripts =
case_sparql_construct = case_utils.case_sparql_construct:main
# Note that numpy (pandas dependency, and pandas is dependency of case_sparql_select) is only supported in Python >= 3.7.
case_sparql_select = case_utils.case_sparql_select:main

[options.package_data]
case_utils = py.typed
14 changes: 14 additions & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ all: \
check-case_sparql_construct \
check-case_sparql_select \
check-isomorphic_diff \
check-mypy \
download

.venv.done.log: \
Expand Down Expand Up @@ -79,6 +80,7 @@ all-case_sparql_select: \
# These check calls are provided in preferred run-order.
check: \
check-isomorphic_diff \
check-mypy \
check-case_file \
check-case_sparql_construct \
check-case_sparql_select
Expand Down Expand Up @@ -116,6 +118,18 @@ check-isomorphic_diff: \
--directory isomorphic_diff \
check

# mypy is called against specific members of the tests directory to avoid descending into the virtual environment.
check-mypy: \
.venv.done.log
source venv/bin/activate \
&& mypy \
$(top_srcdir)/case_utils \
case_file \
case_sparql_construct \
case_utils \
hexbinary \
src

clean:
@$(MAKE) \
--directory case_sparql_select \
Expand Down
19 changes: 13 additions & 6 deletions tests/case_file/test_case_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os

import pytest
import rdflib.plugins.sparql
import rdflib.plugins.sparql # type: ignore

_logger = logging.getLogger(os.path.basename(__file__))

Expand All @@ -37,20 +37,24 @@

SRCDIR = os.path.dirname(__file__)

def load_graph(
    filename: str
) -> rdflib.Graph:
    """
    Parse the RDF file at filename into a fresh rdflib.Graph and return it.
    The serialization format is left to rdflib's guessing from the filename.
    """
    in_graph = rdflib.Graph()
    in_graph.parse(filename)
    return in_graph

@pytest.fixture
def graph_case_file() -> rdflib.Graph:
    """
    Pytest fixture: graph parsed from sample.txt.ttl, located next to this
    test module (full hashing run of case_file).
    """
    return load_graph(os.path.join(SRCDIR, "sample.txt.ttl"))

@pytest.fixture
def graph_case_file_disable_hashes() -> rdflib.Graph:
    """
    Pytest fixture: graph parsed from sample.txt-disable_hashes.ttl, located
    next to this test module (case_file run with hashing disabled).
    """
    return load_graph(os.path.join(SRCDIR, "sample.txt-disable_hashes.ttl"))

def test_confirm_hashes(graph_case_file):
def test_confirm_hashes(
graph_case_file : rdflib.Graph
) -> None:
expected = {
"MD5": "098F6BCD4621D373CADE4E832627B4F6",
"SHA1": "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3",
Expand Down Expand Up @@ -91,7 +95,10 @@ def test_confirm_hashes(graph_case_file):

assert expected == computed

def test_confirm_mtime(graph_case_file, graph_case_file_disable_hashes):
def test_confirm_mtime(
graph_case_file : rdflib.Graph,
graph_case_file_disable_hashes : rdflib.Graph
) -> None:
query_confirm_mtime = """
SELECT ?nFile
WHERE {
Expand Down
Loading