Skip to content

Ac 178 #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,36 @@ case_file --disable-hashes sample.txt.json sample.txt
```


### SPARQL executors

Two commands are provided to generate output from a SPARQL query and one or more input graphs. Input graphs can be any graph, such as instance data or supplementary ontology files that supply custom class definitions or other external ontologies.


#### `case_sparql_construct`

To use a SPARQL `CONSTRUCT` query to make a supplementary graph file from one or more input graphs:

```bash
case_sparql_construct output.json input.sparql input.json [input-2.json ...]
```


#### `case_sparql_select`

To use a SPARQL `SELECT` query to make a table from one or more input graphs:

```bash
# HTML output with Bootstrap classes
# (e.g. for Jekyll-backed websites)
case_sparql_select output.html input.sparql input.json [input-2.json ...]

# Markdown, GitHub-flavored
case_sparql_select output.md input.sparql input.json [input-2.json ...]
```

Note that `case_sparql_select` is not guaranteed to function with Pythons below version 3.7.


### `local_uuid`

This [module](case_utils/local_uuid.py) provides a wrapper UUID generator, `local_uuid()`. Its main purpose is making example data generate consistent identifiers, and intentionally includes mechanisms to make it difficult to activate this mode without awareness of the caller.
Expand Down
87 changes: 87 additions & 0 deletions case_utils/case_sparql_construct/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env python3

# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.

"""
This script executes a SPARQL CONSTRUCT query, returning a graph of the generated triples.
"""

__version__ = "0.1.0"

import argparse
import os
import logging

import rdflib.plugins.sparql

import case_utils

_logger = logging.getLogger(os.path.basename(__file__))

def main():
    """
    Command-line entry point.

    Runs a SPARQL CONSTRUCT query (read from args.in_sparql) against the union
    of one or more input graph files (args.in_graph), and serializes the
    constructed triples to args.out_graph.

    Raises:
        ValueError: If --disallow-empty-results is passed and the query
          constructs no triples.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--debug", action="store_true")
    parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
    parser.add_argument("--output-format", help="Override extension-based format guesser.")
    parser.add_argument("out_graph")
    parser.add_argument("in_sparql")
    parser.add_argument("in_graph", nargs="+")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Aggregate every input file into a single graph; the query runs over the union.
    in_graph = rdflib.Graph()
    for in_graph_filename in args.in_graph:
        in_graph.parse(in_graph_filename, format=case_utils.guess_format(in_graph_filename))
    _logger.debug("len(in_graph) = %d.", len(in_graph))

    out_graph = rdflib.Graph()

    # Inherit prefixes defined in input context dictionary.
    nsdict = {k: v for (k, v) in in_graph.namespace_manager.namespaces()}
    for prefix in sorted(nsdict.keys()):
        out_graph.bind(prefix, nsdict[prefix])

    _logger.debug("Running query in %r.", args.in_sparql)
    with open(args.in_sparql, "r") as in_fh:
        construct_query_text = in_fh.read().strip()

    construct_query_object = rdflib.plugins.sparql.prepareQuery(construct_query_text, initNs=nsdict)

    # https://rdfextras.readthedocs.io/en/latest/working_with.html
    construct_query_result = in_graph.query(construct_query_object)
    _logger.debug("type(construct_query_result) = %r.", type(construct_query_result))
    _logger.debug("len(construct_query_result) = %d.", len(construct_query_result))
    tally = 0
    for (row_no, row) in enumerate(construct_query_result):
        tally = row_no + 1
        if row_no == 0:
            _logger.debug("row[0] = %r.", row)
        out_graph.add(row)

    # BUG FIX: --disallow-empty-results was previously accepted but silently
    # ignored.  Honor it, matching case_sparql_select's behavior and message.
    if tally == 0 and args.disallow_empty_results:
        raise ValueError("Failed to return any results.")

    # Prefer the explicit CLI override; otherwise guess the serialization
    # format from the output filename's extension.
    if args.output_format is None:
        output_format = case_utils.guess_format(args.out_graph)
    else:
        output_format = args.output_format

    serialize_kwargs = {
        "format": output_format
    }
    if output_format == "json-ld":
        # BUG FIX: this previously read from an undefined name `graph`,
        # raising NameError whenever JSON-LD output was requested.  The
        # intended source of the context prefixes is the output graph.
        context_dictionary = {k: v for (k, v) in out_graph.namespace_manager.namespaces()}
        serialize_kwargs["context"] = context_dictionary

    out_graph.serialize(args.out_graph, **serialize_kwargs)

if __name__ == "__main__":
    main()
116 changes: 116 additions & 0 deletions case_utils/case_sparql_select/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.

"""
This script executes a SPARQL SELECT query, returning a table representation. The design of the workflow is based on this example built on SPARQLWrapper:
https://lawlesst.github.io/notebook/sparql-dataframe.html

Note that this assumes a limited syntax style in the outer SELECT clause of the query - only named variables, no aggregations, and a single space character separating all variable names. E.g.:

SELECT ?x ?y ?z
WHERE
{ ... }

The word "DISTINCT" will also be cut from the query, if present.

Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
"""

__version__ = "0.3.0"

import argparse
import binascii
import os
import logging

import pandas as pd
import rdflib.plugins.sparql

import case_utils

NS_XSD = rdflib.XSD

_logger = logging.getLogger(os.path.basename(__file__))

def main():
    """
    Command-line entry point.

    Runs a SPARQL SELECT query (read from args.in_sparql) against the union of
    one or more input graph files (args.in_graph), and writes the result table
    to args.out_table as HTML (.html) or GitHub-flavored Markdown (.md).

    Raises:
        ValueError: If --disallow-empty-results is passed and the query
          returns no rows.
        NotImplementedError: If args.out_table has an unsupported extension.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--debug", action="store_true")
    parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
    parser.add_argument("out_table", help="Expected extensions are .html for HTML tables or .md for Markdown tables.")
    parser.add_argument("in_sparql")
    parser.add_argument("in_graph", nargs="+")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Aggregate every input file into a single graph; the query runs over the union.
    graph = rdflib.Graph()
    for in_graph_filename in args.in_graph:
        graph.parse(in_graph_filename, format=case_utils.guess_format(in_graph_filename))

    # Inherit prefixes defined in input context dictionary.
    nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}

    with open(args.in_sparql, "r") as in_fh:
        select_query_text = in_fh.read().strip()
    _logger.debug("select_query_text = %r.", select_query_text)

    # Build columns list from SELECT line.
    # NOTE: relies on the restricted SELECT syntax documented in the module
    # docstring - named variables only, single spaces, "SELECT " at line start.
    select_query_text_lines = select_query_text.split("\n")
    select_line = [line for line in select_query_text_lines if line.startswith("SELECT ")][0]
    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")

    tally = 0
    records = []
    select_query_object = rdflib.plugins.sparql.prepareQuery(select_query_text, initNs=nsdict)
    for (row_no, row) in enumerate(graph.query(select_query_object)):
        tally = row_no + 1
        record = []
        for (column_no, column) in enumerate(row):
            if column is None:
                # Unbound variables (e.g. from OPTIONAL clauses) render as empty cells.
                column_value = ""
            elif isinstance(column, rdflib.term.Literal) and column.datatype == NS_XSD.hexBinary:
                # Use hexlify to convert xsd:hexBinary to ASCII.
                # The render to ASCII is in support of this script rendering results for website viewing.
                # .decode() is because hexlify returns bytes.
                column_value = binascii.hexlify(column.toPython()).decode()
            else:
                column_value = column.toPython()
            if row_no == 0:
                _logger.debug("row[0]column[%d] = %r.", column_no, column_value)
            record.append(column_value)
        records.append(record)
    if tally == 0 and args.disallow_empty_results:
        raise ValueError("Failed to return any results.")

    df = pd.DataFrame(records, columns=variables)

    table_text = None
    if args.out_table.endswith(".html"):
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(classes=("table", "table-bordered", "table-condensed"))
    elif args.out_table.endswith(".md"):
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume GitHub-flavored Markdown.
        table_text = df.to_markdown(tablefmt="github")
    if table_text is None:
        # BUG FIX: the message previously used a logging-style trailing
        # argument, so the %r placeholder was never interpolated and the
        # filename became a second exception argument.
        raise NotImplementedError("Unsupported output extension for output filename %r." % args.out_table)

    with open(args.out_table, "w") as out_fh:
        out_fh.write(table_text)

if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,18 @@ classifiers =
# TODO The constraint on pyparsing can be removed when rdflib Issue #1190 is resolved.
# https://github.com/RDFLib/rdflib/issues/1190
install_requires =
# Note that numpy (pandas dependency) is only supported in Python >= 3.7.
pandas;python_version>='3.7'
pyparsing < 3.0.0
rdflib-jsonld
requests
tabulate
packages = find:
python_requires = >=3.6

[options.entry_points]
console_scripts =
case_file = case_utils.case_file:main
case_sparql_construct = case_utils.case_sparql_construct:main
# Note that numpy (pandas dependency, and pandas is dependency of case_sparql_select) is only supported in Python >= 3.7.
case_sparql_select = case_utils.case_sparql_select:main
50 changes: 48 additions & 2 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,26 @@ srcdir := $(shell pwd)
PYTHON3 ?= $(shell which python3.9 2>/dev/null || which python3.8 2>/dev/null || which python3.7 2>/dev/null || which python3.6 2>/dev/null || which python3)

all: \
all-case_file
all-case_file \
all-case_sparql_construct \
all-case_sparql_select

.PHONY: \
all-case_file \
all-case_sparql_construct \
all-case_sparql_select \
check-case_file \
check-case_sparql_construct \
check-case_sparql_select \
check-isomorphic_diff \
download

.venv.done.log: \
$(top_srcdir)/.git_submodule_init.done.log \
$(top_srcdir)/case_utils/__init__.py \
$(top_srcdir)/case_utils/case_file/__init__.py \
$(top_srcdir)/case_utils/case_sparql_construct/__init__.py \
$(top_srcdir)/case_utils/case_sparql_select/__init__.py \
$(top_srcdir)/case_utils/local_uuid.py \
$(top_srcdir)/setup.cfg \
$(top_srcdir)/setup.py \
Expand All @@ -58,13 +66,30 @@ all-case_file: \
$(MAKE) \
--directory case_file

all-case_sparql_construct: \
.venv.done.log
$(MAKE) \
--directory case_sparql_construct

all-case_sparql_select: \
.venv.done.log
# Only descend if python>=3.7, due to pandas dependency unsatisfiable in 3.6.x.
# Boolean explanation: sys.exit(False) has exit status 0.
venv/bin/python3 -c 'import sys ; sys.exit(not (sys.version_info < (3, 7)))' \
|| $(MAKE) \
--directory case_sparql_select

# These check calls are provided in preferred run-order.
check: \
check-isomorphic_diff \
check-case_file
check-case_file \
check-case_sparql_construct \
check-case_sparql_select
source venv/bin/activate \
&& pytest \
--ignore case_file \
--ignore case_sparql_construct \
--ignore case_sparql_select \
--log-level=DEBUG

check-case_file: \
Expand All @@ -73,13 +98,34 @@ check-case_file: \
--directory case_file \
check

check-case_sparql_construct: \
.venv.done.log
$(MAKE) \
--directory case_sparql_construct \
check

check-case_sparql_select: \
.venv.done.log
# Only descend if python>=3.7, due to pandas dependency unsatisfiable in 3.6.x.
# Boolean explanation: sys.exit(False) has exit status 0.
venv/bin/python3 -c 'import sys ; sys.exit(not (sys.version_info < (3, 7)))' \
|| $(MAKE) \
--directory case_sparql_select \
check

check-isomorphic_diff: \
.venv.done.log
$(MAKE) \
--directory isomorphic_diff \
check

clean:
@$(MAKE) \
--directory case_sparql_select \
clean
@$(MAKE) \
--directory case_sparql_construct \
clean
@$(MAKE) \
--directory case_file \
clean
Expand Down
1 change: 1 addition & 0 deletions tests/case_sparql_construct/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
output.ttl
Loading