Skip to content

AC-211 #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Oct 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions case_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@

__version__ = "0.2.1"

import typing
import warnings

import rdflib.util
import rdflib.util # type: ignore

from . import local_uuid

def guess_format(
    fpath: str,
    fmap: typing.Optional[typing.Dict[str, str]] = None
) -> typing.Optional[str]:
    """
    Deprecated pass-through to rdflib.util.guess_format.

    :param fpath: File path whose serialization format should be guessed.
    :param fmap: Optional extension-to-format mapping, forwarded unchanged.
    :returns: The format string rdflib guesses, or None if no guess is made.

    Emits a DeprecationWarning on every call; scheduled for removal in
    case_utils 0.4.0.
    """
    warnings.warn("The functionality in case_utils.guess_format is now upstream. Please revise your code to use rdflib.util.guess_format. The function arguments remain the same. case_utils.guess_format will be removed in case_utils 0.4.0.", DeprecationWarning)

    return rdflib.util.guess_format(fpath, fmap)  # type: ignore
65 changes: 44 additions & 21 deletions case_utils/case_file/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
import datetime
import hashlib
import os
import typing
import warnings

import rdflib
import rdflib # type: ignore

import case_utils

Expand All @@ -34,7 +36,24 @@
NS_UCO_VOCABULARY = rdflib.Namespace("https://unifiedcyberontology.org/ontology/uco/vocabulary#")
NS_XSD = rdflib.XSD

def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX, disable_hashes=False, disable_mtime=False):
# Shortcut syntax for defining an immutable named tuple is noted here:
# https://docs.python.org/3/library/typing.html#typing.NamedTuple
# via the "See also" box here: https://docs.python.org/3/library/collections.html#collections.namedtuple
class HashDict(typing.NamedTuple):
    """
    Immutable record pairing a file's byte count with its content hashes
    (hex digest strings), used to confirm two hashing passes agree.
    """

    # Number of bytes read while hashing the file.
    filesize: int
    # Hex digests as produced by hashlib's .hexdigest() (lowercase).
    md5: str
    sha1: str
    sha256: str
    sha512: str

def create_file_node(
graph : rdflib.Graph,
filepath : str,
node_iri : typing.Optional[str] = None,
node_prefix : str = DEFAULT_PREFIX,
disable_hashes : bool = False,
disable_mtime : bool = False
) -> rdflib.URIRef:
r"""
This function characterizes the file at filepath.

Expand Down Expand Up @@ -119,20 +138,20 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
))

# Compute hashes until they are re-computed and match once. (This is a lesson learned from working with a NAS that had a subtly faulty network cable.)
successful_hashdict = None
last_hashdict = dict()

successful_hashdict : typing.Optional[HashDict] = None
last_hashdict : typing.Optional[HashDict] = None
for attempt_no in [0, 1, 2, 3]:
current_hashdict = dict()
# Hash file's contents.
# This hashing logic was partially copied from DFXML's walk_to_dfxml.py.
md5obj = hashlib.md5()
sha1obj = hashlib.sha1()
sha256obj = hashlib.sha256()
sha512obj = hashlib.sha512()
stashed_error = None
byte_tally = 0
with open(filepath, "rb") as in_fh:
chunk_size = 2**22
byte_tally = 0
while True:
buf = b""
try:
Expand All @@ -147,13 +166,15 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
sha1obj.update(buf)
sha256obj.update(buf)
sha512obj.update(buf)
current_hashdict["filesize"] = byte_tally
if not stashed_error is None:
raise stashed_error
current_hashdict["md5"] = md5obj.hexdigest()
current_hashdict["sha1"] = sha1obj.hexdigest()
current_hashdict["sha256"] = sha256obj.hexdigest()
current_hashdict["sha512"] = sha512obj.hexdigest()
current_hashdict = HashDict(
byte_tally,
md5obj.hexdigest(),
sha1obj.hexdigest(),
sha256obj.hexdigest(),
sha512obj.hexdigest()
)
if last_hashdict == current_hashdict:
successful_hashdict = current_hashdict
break
Expand All @@ -163,22 +184,23 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
del current_hashdict
if successful_hashdict is None:
raise ValueError("Failed to confirm hashes of file %r." % filepath)
if successful_hashdict["filesize"] != file_stat.st_size:
if successful_hashdict.filesize != file_stat.st_size:
# TODO - Discuss with AC whether this should be something stronger, like an assertion error.
_logger.warning(
"Inode file size and hashed file sizes disagree: %d vs. %d.",
file_stat.st_size,
successful_hashdict["filesize"]
warnings.warn(
"Inode file size and hashed file sizes disagree: %d vs. %d." % (
file_stat.st_size,
successful_hashdict.filesize
)
)
# TODO - Discuss whether this property should be recorded even if hashes are not attempted.
graph.add((
n_contentdata_facet,
NS_UCO_OBSERVABLE.sizeInBytes,
rdflib.Literal(successful_hashdict["filesize"])
rdflib.Literal(successful_hashdict.filesize)
))

# Add confirmed hashes into graph.
for key in successful_hashdict:
for key in successful_hashdict._fields:
if not key in ("md5", "sha1", "sha256", "sha512"):
continue
n_hash = rdflib.BNode()
Expand All @@ -197,15 +219,16 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
NS_UCO_TYPES.hashMethod,
rdflib.Literal(key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab)
))
hash_value = getattr(successful_hashdict, key)
graph.add((
n_hash,
NS_UCO_TYPES.hashValue,
rdflib.Literal(successful_hashdict[key].upper(), datatype=NS_XSD.hexBinary)
rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary)
))

return n_file

def main():
def main() -> None:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--base-prefix", default=DEFAULT_PREFIX)
Expand Down Expand Up @@ -234,7 +257,7 @@ def main():
else:
output_format = args.output_format

serialize_kwargs = {
serialize_kwargs : typing.Dict[str, typing.Any] = {
"format": output_format
}
if output_format == "json-ld":
Expand Down
7 changes: 4 additions & 3 deletions case_utils/case_sparql_construct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
import argparse
import os
import logging
import typing

import rdflib.plugins.sparql
import rdflib.plugins.sparql # type: ignore

import case_utils

_logger = logging.getLogger(os.path.basename(__file__))

def main():
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
Expand Down Expand Up @@ -74,7 +75,7 @@ def main():
else:
output_format = args.output_format

serialize_kwargs = {
serialize_kwargs : typing.Dict[str, typing.Any] = {
"format": output_format
}
if output_format == "json-ld":
Expand Down
6 changes: 3 additions & 3 deletions case_utils/case_sparql_select/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@
import os
import logging

import pandas as pd
import rdflib.plugins.sparql
import pandas as pd # type: ignore
import rdflib.plugins.sparql # type: ignore

import case_utils

NS_XSD = rdflib.XSD

_logger = logging.getLogger(os.path.basename(__file__))

def main():
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
Expand Down
15 changes: 8 additions & 7 deletions case_utils/local_uuid.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@
import sys
import uuid

# When True, local_uuid() yields deterministic demo UUIDs instead of UUID4.
# Enabled only via configure() reading DEMO_UUID_REQUESTING_NONRANDOM.
USE_DEMO_UUID: bool = False

# Counter appended to the demo UUID's seed material (see demo_uuid);
# presumably incremented per call — increment site not visible here.
DEMO_UUID_COUNTER: int = 0

def configure() -> None:
    """
    Opt in to demo (non-random) UUID generation from the environment.

    Sets the module-global USE_DEMO_UUID flag to True when the environment
    variable DEMO_UUID_REQUESTING_NONRANDOM equals the exact string
    "NONRANDOM_REQUESTED"; otherwise the flag is left untouched.
    """
    global USE_DEMO_UUID

    if os.getenv("DEMO_UUID_REQUESTING_NONRANDOM") == "NONRANDOM_REQUESTED":
        USE_DEMO_UUID = True

def demo_uuid():
def demo_uuid() -> str:
"""
This function generates a repeatable UUID, drawing on non-varying elements of the environment and process call for entropy.

Expand All @@ -52,16 +52,17 @@ def demo_uuid():
parts.append(str(DEMO_UUID_COUNTER))

# Component: Present working directory, replacing $HOME with '~'.
parts.append(os.getcwd().replace(os.getenv("HOME"), "~"))
env_HOME : str = os.getenv("HOME", "/nonexistent")
parts.append(os.getcwd().replace(env_HOME, "~"))

# Component: Argument vector.
parts.extend(sys.argv)

return str(uuid.uuid5(uuid.NAMESPACE_URL, "/".join(parts)))

def local_uuid():
def local_uuid() -> str:
"""
Generate either a UUID4, or if requested via environment configuration, a non-random demo UUID. Returns a string.
Generate either a UUID4, or if requested via environment configuration, a non-random demo UUID.
"""
global USE_DEMO_UUID
if USE_DEMO_UUID:
Expand Down
13 changes: 13 additions & 0 deletions case_utils/py.typed
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.

# This file is defined to support PEP 561:
# https://www.python.org/dev/peps/pep-0561/
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ classifiers =
Programming Language :: Python :: 3

[options]
include_package_data = true
# TODO The constraint on pyparsing can be removed when rdflib Issue #1190 is resolved.
# https://github.com/RDFLib/rdflib/issues/1190
install_requires =
Expand All @@ -32,3 +33,6 @@ console_scripts =
case_sparql_construct = case_utils.case_sparql_construct:main
# Note that numpy (pandas dependency, and pandas is dependency of case_sparql_select) is only supported in Python >= 3.7.
case_sparql_select = case_utils.case_sparql_select:main

[options.package_data]
case_utils = py.typed
14 changes: 14 additions & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ all: \
check-case_sparql_construct \
check-case_sparql_select \
check-isomorphic_diff \
check-mypy \
download

.venv.done.log: \
Expand Down Expand Up @@ -79,6 +80,7 @@ all-case_sparql_select: \
# These check calls are provided in preferred run-order.
check: \
check-isomorphic_diff \
check-mypy \
check-case_file \
check-case_sparql_construct \
check-case_sparql_select
Expand Down Expand Up @@ -116,6 +118,18 @@ check-isomorphic_diff: \
--directory isomorphic_diff \
check

# mypy is called against specific members of the tests directory to avoid descending into the virtual environment.
check-mypy: \
.venv.done.log
source venv/bin/activate \
&& mypy \
$(top_srcdir)/case_utils \
case_file \
case_sparql_construct \
case_utils \
hexbinary \
src

clean:
@$(MAKE) \
--directory case_sparql_select \
Expand Down
19 changes: 13 additions & 6 deletions tests/case_file/test_case_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os

import pytest
import rdflib.plugins.sparql
import rdflib.plugins.sparql # type: ignore

_logger = logging.getLogger(os.path.basename(__file__))

Expand All @@ -37,20 +37,24 @@

SRCDIR = os.path.dirname(__file__)

def load_graph(
    filename: str
) -> rdflib.Graph:
    """
    Parse the RDF file at filename into a fresh rdflib.Graph and return it.
    The serialization format is left to rdflib's guessing from the filename.
    """
    in_graph = rdflib.Graph()
    in_graph.parse(filename)
    return in_graph

@pytest.fixture
def graph_case_file() -> rdflib.Graph:
    """
    Pytest fixture: graph parsed from sample.txt.ttl, located next to this
    test module (full hashing run of case_file).
    """
    return load_graph(os.path.join(SRCDIR, "sample.txt.ttl"))

@pytest.fixture
def graph_case_file_disable_hashes() -> rdflib.Graph:
    """
    Pytest fixture: graph parsed from sample.txt-disable_hashes.ttl, located
    next to this test module (case_file run with hashing disabled).
    """
    return load_graph(os.path.join(SRCDIR, "sample.txt-disable_hashes.ttl"))

def test_confirm_hashes(graph_case_file):
def test_confirm_hashes(
graph_case_file : rdflib.Graph
) -> None:
expected = {
"MD5": "098F6BCD4621D373CADE4E832627B4F6",
"SHA1": "A94A8FE5CCB19BA61C4C0873D391E987982FBBD3",
Expand Down Expand Up @@ -91,7 +95,10 @@ def test_confirm_hashes(graph_case_file):

assert expected == computed

def test_confirm_mtime(graph_case_file, graph_case_file_disable_hashes):
def test_confirm_mtime(
graph_case_file : rdflib.Graph,
graph_case_file_disable_hashes : rdflib.Graph
) -> None:
query_confirm_mtime = """
SELECT ?nFile
WHERE {
Expand Down
Loading