diff --git a/README.md b/README.md index 8d1e38a..7c2bd8d 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,36 @@ case_file --disable-hashes sample.txt.json sample.txt ``` +### SPARQL executors + +Two commands are provided to generate output from a SPARQL query and one or more input graphs. Input graphs can be any graph, such as instance data or supplementary ontology files that supply custom class definitions or other external ontologies. + + +#### `case_sparql_construct` + +To use a SPARQL `CONSTRUCT` query to make a supplementary graph file from one or more input graphs: + +```bash +case_sparql_construct output.json input.sparql input.json [input-2.json ...] +``` + + +#### `case_sparql_select` + +To use a SPARQL `SELECT` query to make a table from one or more input graphs: + +```bash +# HTML output with Bootstrap classes +# (e.g. for Jekyll-backed websites) +case_sparql_select output.html input.sparql input.json [input-2.json ...] + +# Markdown, Github-flavored +case_sparql_select output.md input.sparql input.json [input-2.json ...] +``` + +Note that `case_sparql_select` is not guaranteed to function with Pythons below version 3.7. + + ### `local_uuid` This [module](case_utils/local_uuid.py) provides a wrapper UUID generator, `local_uuid()`. Its main purpose is making example data generate consistent identifiers, and intentionally includes mechanisms to make it difficult to activate this mode without awareness of the caller. diff --git a/case_utils/case_sparql_construct/__init__.py b/case_utils/case_sparql_construct/__init__.py new file mode 100644 index 0000000..0a77e4e --- /dev/null +++ b/case_utils/case_sparql_construct/__init__.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# This software was developed at the National Institute of Standards +# and Technology by employees of the Federal Government in the course +# of their official duties. Pursuant to title 17 Section 105 of the +# United States Code this software is not subject to copyright +# protection and is in the public domain. NIST assumes no +# responsibility whatsoever for its use by other parties, and makes +# no guarantees, expressed or implied, about its quality, +# reliability, or any other characteristic. +# +# We would appreciate acknowledgement if the software is used. + +""" +This script executes a SPARQL CONSTRUCT query, returning a graph of the generated triples. +""" + +__version__ = "0.1.0" + +import argparse +import os +import logging + +import rdflib.plugins.sparql + +import case_utils + +_logger = logging.getLogger(os.path.basename(__file__)) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.") + parser.add_argument("--output-format", help="Override extension-based format guesser.") + parser.add_argument("out_graph") + parser.add_argument("in_sparql") + parser.add_argument("in_graph", nargs="+") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + in_graph = rdflib.Graph() + for in_graph_filename in args.in_graph: + in_graph.parse(in_graph_filename, format=case_utils.guess_format(in_graph_filename)) + _logger.debug("len(in_graph) = %d.", len(in_graph)) + + out_graph = rdflib.Graph() + + # Inherit prefixes defined in input context dictionary. 
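+    # (These bindings serve two purposes: they seed initNs for the prepared
+    # query below, and they keep the serialized output graph's prefixes
+    # consistent with the input graphs'.)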
+    nsdict = {k:v for (k,v) in in_graph.namespace_manager.namespaces()}
+    for prefix in sorted(nsdict.keys()):
+        out_graph.bind(prefix, nsdict[prefix])
+
+    _logger.debug("Running query in %r." % args.in_sparql)
+    construct_query_text = None
+    with open(args.in_sparql, "r") as in_fh:
+        construct_query_text = in_fh.read().strip()
+    assert construct_query_text is not None
+
+    construct_query_object = rdflib.plugins.sparql.prepareQuery(construct_query_text, initNs=nsdict)
+
+    # https://rdfextras.readthedocs.io/en/latest/working_with.html
+    construct_query_result = in_graph.query(construct_query_object)
+    _logger.debug("type(construct_query_result) = %r." % type(construct_query_result))
+    _logger.debug("len(construct_query_result) = %d." % len(construct_query_result))
+    for (row_no, row) in enumerate(construct_query_result):
+        if row_no == 0:
+            _logger.debug("row[0] = %r." % (row,))
+        out_graph.add(row)
+
+    output_format = None
+    if args.output_format is None:
+        output_format = case_utils.guess_format(args.out_graph)
+    else:
+        output_format = args.output_format
+
+    serialize_kwargs = {
+        "format": output_format
+    }
+    if output_format == "json-ld":
+        context_dictionary = {k:v for (k,v) in out_graph.namespace_manager.namespaces()}
+        serialize_kwargs["context"] = context_dictionary
+
+    out_graph.serialize(args.out_graph, **serialize_kwargs)
+
+if __name__ == "__main__":
+    main()
diff --git a/case_utils/case_sparql_select/__init__.py b/case_utils/case_sparql_select/__init__.py
new file mode 100644
index 0000000..357e3b0
--- /dev/null
+++ b/case_utils/case_sparql_select/__init__.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+# This software was developed at the National Institute of Standards
+# and Technology by employees of the Federal Government in the course
+# of their official duties. Pursuant to title 17 Section 105 of the
+# United States Code this software is not subject to copyright
+# protection and is in the public domain. NIST assumes no
+# responsibility whatsoever for its use by other parties, and makes
+# no guarantees, expressed or implied, about its quality,
+# reliability, or any other characteristic.
+#
+# We would appreciate acknowledgement if the software is used.
+
+"""
+This script executes a SPARQL SELECT query, returning a table representation. The design of the workflow is based on this example built on SPARQLWrapper:
+https://lawlesst.github.io/notebook/sparql-dataframe.html
+
+Note that this assumes a limited syntax style in the outer SELECT clause of the query - only named variables, no aggregations, and a single space character separating all variable names. E.g.:
+
+SELECT ?x ?y ?z
+WHERE
+{ ... }
+
+The word "DISTINCT" will also be cut from the query, if present.
+
+Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
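+
+For instance (an illustrative sketch, not a query shipped with this package), an aggregating query can be nested as a subquery, so that the outer SELECT line keeps the simple space-separated, named-variable form assumed above:
+
+SELECT ?s ?tally
+WHERE
+{
+    {
+        SELECT ?s (COUNT(?p) AS ?tally)
+        WHERE { ?s ?p ?o . }
+        GROUP BY ?s
+    }
+}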
+""" + +__version__ = "0.3.0" + +import argparse +import binascii +import os +import logging + +import pandas as pd +import rdflib.plugins.sparql + +import case_utils + +NS_XSD = rdflib.XSD + +_logger = logging.getLogger(os.path.basename(__file__)) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.") + parser.add_argument("out_table", help="Expected extensions are .html for HTML tables or .md for Markdown tables.") + parser.add_argument("in_sparql") + parser.add_argument("in_graph", nargs="+") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + graph = rdflib.Graph() + for in_graph_filename in args.in_graph: + graph.parse(in_graph_filename, format=case_utils.guess_format(in_graph_filename)) + + # Inherit prefixes defined in input context dictionary. + nsdict = {k:v for (k,v) in graph.namespace_manager.namespaces()} + + select_query_text = None + with open(args.in_sparql, "r") as in_fh: + select_query_text = in_fh.read().strip() + _logger.debug("select_query_text = %r." % select_query_text) + + # Build columns list from SELECT line. + select_query_text_lines = select_query_text.split("\n") + select_line = [line for line in select_query_text_lines if line.startswith("SELECT ")][0] + variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ") + + tally = 0 + records = [] + select_query_object = rdflib.plugins.sparql.prepareQuery(select_query_text, initNs=nsdict) + for (row_no, row) in enumerate(graph.query(select_query_object)): + tally = row_no + 1 + record = [] + for (column_no, column) in enumerate(row): + if column is None: + column_value = "" + elif isinstance(column, rdflib.term.Literal) and column.datatype == NS_XSD.hexBinary: + # Use hexlify to convert xsd:hexBinary to ASCII. + # The render to ASCII is in support of this script rendering results for website viewing. + # .decode() is because hexlify returns bytes. + column_value = binascii.hexlify(column.toPython()).decode() + else: + column_value = column.toPython() + if row_no == 0: + _logger.debug("row[0]column[%d] = %r." % (column_no, column_value)) + record.append(column_value) + records.append(record) + if tally == 0: + if args.disallow_empty_results: + raise ValueError("Failed to return any results.") + + df = pd.DataFrame(records, columns=variables) + + table_text = None + if args.out_table.endswith(".html"): + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html + # Add CSS classes for CASE website Bootstrap support. + table_text = df.to_html(classes=("table", "table-bordered", "table-condensed")) + elif args.out_table.endswith(".md"): + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html + # https://pypi.org/project/tabulate/ + # Assume Github-flavored Markdown. + table_text = df.to_markdown(tablefmt="github") + if table_text is None: + raise NotImplementedError("Unsupported output extension for output filename %r.", args.out_table) + + with open(args.out_table, "w") as out_fh: + out_fh.write(table_text) + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg index a6c4bb9..5837731 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,12 +18,18 @@ classifiers = # TODO The constraint on pyparsing can be removed when rdflib Issue #1190 is resolved. 
# https://github.com/RDFLib/rdflib/issues/1190 install_requires = + # Note that numpy (pandas dependency) is only supported in Python >= 3.7. + pandas;python_version>='3.7' pyparsing < 3.0.0 rdflib-jsonld requests + tabulate packages = find: python_requires = >=3.6 [options.entry_points] console_scripts = case_file = case_utils.case_file:main + case_sparql_construct = case_utils.case_sparql_construct:main + # Note that numpy (pandas dependency, and pandas is dependency of case_sparql_select) is only supported in Python >= 3.7. + case_sparql_select = case_utils.case_sparql_select:main diff --git a/tests/Makefile b/tests/Makefile index 8a83ec6..bdbcee5 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -20,11 +20,17 @@ srcdir := $(shell pwd) PYTHON3 ?= $(shell which python3.9 2>/dev/null || which python3.8 2>/dev/null || which python3.7 2>/dev/null || which python3.6 2>/dev/null || which python3) all: \ - all-case_file + all-case_file \ + all-case_sparql_construct \ + all-case_sparql_select .PHONY: \ all-case_file \ + all-case_sparql_construct \ + all-case_sparql_select \ check-case_file \ + check-case_sparql_construct \ + check-case_sparql_select \ check-isomorphic_diff \ download @@ -32,6 +38,8 @@ all: \ $(top_srcdir)/.git_submodule_init.done.log \ $(top_srcdir)/case_utils/__init__.py \ $(top_srcdir)/case_utils/case_file/__init__.py \ + $(top_srcdir)/case_utils/case_sparql_construct/__init__.py \ + $(top_srcdir)/case_utils/case_sparql_select/__init__.py \ $(top_srcdir)/case_utils/local_uuid.py \ $(top_srcdir)/setup.cfg \ $(top_srcdir)/setup.py \ @@ -58,13 +66,30 @@ all-case_file: \ $(MAKE) \ --directory case_file +all-case_sparql_construct: \ + .venv.done.log + $(MAKE) \ + --directory case_sparql_construct + +all-case_sparql_select: \ + .venv.done.log + # Only descend if python>=3.7, due to pandas dependency unsatisfiable in 3.6.x. + # Boolean explanation: sys.exit(False) has exit status 0. + venv/bin/python3 -c 'import sys ; sys.exit(not (sys.version_info < (3, 7)))' \ + || $(MAKE) \ + --directory case_sparql_select + # These check calls are provided in preferred run-order. check: \ check-isomorphic_diff \ - check-case_file + check-case_file \ + check-case_sparql_construct \ + check-case_sparql_select source venv/bin/activate \ && pytest \ --ignore case_file \ + --ignore case_sparql_construct \ + --ignore case_sparql_select \ --log-level=DEBUG check-case_file: \ @@ -73,6 +98,21 @@ check-case_file: \ --directory case_file \ check +check-case_sparql_construct: \ + .venv.done.log + $(MAKE) \ + --directory case_sparql_construct \ + check + +check-case_sparql_select: \ + .venv.done.log + # Only descend if python>=3.7, due to pandas dependency unsatisfiable in 3.6.x. + # Boolean explanation: sys.exit(False) has exit status 0. 
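+	# (That is: the guard exits nonzero, so the || branch descends into case_sparql_select, exactly when the interpreter is 3.7 or newer.)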
+ venv/bin/python3 -c 'import sys ; sys.exit(not (sys.version_info < (3, 7)))' \ + || $(MAKE) \ + --directory case_sparql_select \ + check + check-isomorphic_diff: \ .venv.done.log $(MAKE) \ @@ -80,6 +120,12 @@ check-isomorphic_diff: \ check clean: + @$(MAKE) \ + --directory case_sparql_select \ + clean + @$(MAKE) \ + --directory case_sparql_construct \ + clean @$(MAKE) \ --directory case_file \ clean diff --git a/tests/case_sparql_construct/.gitignore b/tests/case_sparql_construct/.gitignore new file mode 100644 index 0000000..1f24ccd --- /dev/null +++ b/tests/case_sparql_construct/.gitignore @@ -0,0 +1 @@ +output.ttl diff --git a/tests/case_sparql_construct/Makefile b/tests/case_sparql_construct/Makefile new file mode 100644 index 0000000..9bc0fb3 --- /dev/null +++ b/tests/case_sparql_construct/Makefile @@ -0,0 +1,47 @@ +#!/usr/bin/make -f + +# This software was developed at the National Institute of Standards +# and Technology by employees of the Federal Government in the course +# of their official duties. Pursuant to title 17 Section 105 of the +# United States Code this software is not subject to copyright +# protection and is in the public domain. NIST assumes no +# responsibility whatsoever for its use by other parties, and makes +# no guarantees, expressed or implied, about its quality, +# reliability, or any other characteristic. +# +# We would appreciate acknowledgement if the software is used. + +SHELL := /bin/bash + +top_srcdir := $(shell cd ../.. ; pwd) + +tests_srcdir := $(top_srcdir)/tests + +all: \ + output.ttl + +check: \ + output.ttl + source $(tests_srcdir)/venv/bin/activate \ + && pytest \ + --log-level=DEBUG + +clean: + @rm -rf \ + __pycache__ + @rm -f \ + output.ttl \ + _* + +output.ttl: \ + $(tests_srcdir)/.venv.done.log \ + input-1.sparql \ + input-2.ttl \ + input-3.json + source $(tests_srcdir)/venv/bin/activate \ + && case_sparql_construct \ + _$@ \ + input-1.sparql \ + input-2.ttl \ + input-3.json + mv _$@ $@ diff --git a/tests/case_sparql_construct/README.md b/tests/case_sparql_construct/README.md new file mode 100644 index 0000000..bab41ac --- /dev/null +++ b/tests/case_sparql_construct/README.md @@ -0,0 +1,16 @@ +# Test of CASE SPARQL CONSTRUCT query runner + + +## Test procedure + +The tests in this directory confirms `case_sparql_construct` satisfies a base set of expected command line functionality. +1. Inputs - `input-2.ttl` and `input-3.json` contain a small graph split across two files, and `input-1.sparql` contains a SPARQL `CONSTRUCT` query. +2. Outputs - `output.ttl` is generated by using `case_sparql_construct` to run `input-1.sparql` against the two `input-*.*` graph files. This affirms that `case_sparql_construct` can read multiple input files of differing formats. +3. Output verification - two name-pairs should have vcard records generated. The test `test_templates_with_blank_nodes_result()` confirms those pairs are in the output graph. + + +## References + +The data and query used in `input-2.ttl`, `input-3.json` and `input.sparql` are copied from ["SPARQL Query Language for RDF", Section 10.2.1](https://www.w3.org/TR/rdf-sparql-query/#tempatesWithBNodes), with these modifications: +* `input-2.ttl` contains the original example's `_:a` (Alice) records, but drops the `_:b` (Bob) records. +* `input-3.json` is a conversion of the original Turtle example's `_:b` records to JSON-LD. 
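+
+
+## Manual run
+
+The Makefile drives this test, but the underlying invocation is roughly the following (a sketch; it assumes the `case_sparql_construct` entry point is installed, e.g. in the tests' virtual environment):
+
+```bash
+case_sparql_construct output.ttl input-1.sparql input-2.ttl input-3.json
+```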
diff --git a/tests/case_sparql_construct/input-1.sparql b/tests/case_sparql_construct/input-1.sparql
new file mode 100644
index 0000000..aee09cc
--- /dev/null
+++ b/tests/case_sparql_construct/input-1.sparql
@@ -0,0 +1,14 @@
+# Query source:
+# https://www.w3.org/TR/rdf-sparql-query/#tempatesWithBNodes
+
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+PREFIX vcard: <http://www.w3.org/2001/vcard-rdf/3.0#>
+
+CONSTRUCT { ?x vcard:N _:v .
+            _:v vcard:givenName ?gname .
+            _:v vcard:familyName ?fname }
+WHERE
+ {
+    { ?x foaf:firstname ?gname } UNION { ?x foaf:givenname ?gname } .
+    { ?x foaf:surname ?fname } UNION { ?x foaf:family_name ?fname } .
+ }
diff --git a/tests/case_sparql_construct/input-2.ttl b/tests/case_sparql_construct/input-2.ttl
new file mode 100644
index 0000000..8ac286f
--- /dev/null
+++ b/tests/case_sparql_construct/input-2.ttl
@@ -0,0 +1,4 @@
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+
+_:a foaf:givenname "Alice" .
+_:a foaf:family_name "Hacker" .
diff --git a/tests/case_sparql_construct/input-3.json b/tests/case_sparql_construct/input-3.json
new file mode 100644
index 0000000..6b78699
--- /dev/null
+++ b/tests/case_sparql_construct/input-3.json
@@ -0,0 +1,11 @@
+{
+    "@context": {
+        "foaf": "http://xmlns.com/foaf/0.1/"
+    },
+    "@graph": [
+        {
+            "foaf:firstname": "Bob",
+            "foaf:surname": "Hacker"
+        }
+    ]
+}
diff --git a/tests/case_sparql_construct/test_case_sparql_construct.py b/tests/case_sparql_construct/test_case_sparql_construct.py
new file mode 100644
index 0000000..60da9d9
--- /dev/null
+++ b/tests/case_sparql_construct/test_case_sparql_construct.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+# This software was developed at the National Institute of Standards
+# and Technology by employees of the Federal Government in the course
+# of their official duties. Pursuant to title 17 Section 105 of the
+# United States Code this software is not subject to copyright
+# protection and is in the public domain. NIST assumes no
+# responsibility whatsoever for its use by other parties, and makes
+# no guarantees, expressed or implied, about its quality,
+# reliability, or any other characteristic.
+#
+# We would appreciate acknowledgement if the software is used.
+
+import rdflib.plugins.sparql
+
+import case_utils
+
+def test_templates_with_blank_nodes_result():
+    ground_truth_positive = {
+        ("Alice", "Hacker"),
+        ("Bob", "Hacker")
+    }
+    ground_truth_negative = set()
+
+    graph = rdflib.Graph()
+    graph.parse("output.ttl", format=case_utils.guess_format("output.ttl"))
+
+    computed = set()
+    query_string = """\
+PREFIX vcard: <http://www.w3.org/2001/vcard-rdf/3.0#>
+
+SELECT ?lGivenName ?lFamilyName
+WHERE {
+  ?nNode
+    vcard:givenName ?lGivenName ;
+    vcard:familyName ?lFamilyName ;
+    .
+}
+"""
+    for result in graph.query(query_string):
+        (
+            l_given_name,
+            l_family_name
+        ) = result
+        computed.add((
+            l_given_name.toPython(),
+            l_family_name.toPython()
+        ))
+    assert computed == ground_truth_positive
diff --git a/tests/case_sparql_select/.check-output.html b/tests/case_sparql_select/.check-output.html
new file mode 100644
index 0000000..aff9beb
--- /dev/null
+++ b/tests/case_sparql_select/.check-output.html
@@ -0,0 +1,21 @@
+<table border="1" class="dataframe table table-bordered table-condensed">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>?name</th>
+      <th>?mbox</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Johnny Lee Outlaw</td>
+      <td>mailto:jlow@example.com</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Peter Goodguy</td>
+      <td>mailto:peter@example.org</td>
+    </tr>
+  </tbody>
+</table>
\ No newline at end of file
diff --git a/tests/case_sparql_select/.check-output.md b/tests/case_sparql_select/.check-output.md
new file mode 100644
index 0000000..77b05f4
--- /dev/null
+++ b/tests/case_sparql_select/.check-output.md
@@ -0,0 +1,4 @@
+|    | ?name             | ?mbox                    |
+|----|-------------------|--------------------------|
+|  0 | Johnny Lee Outlaw | mailto:jlow@example.com  |
+|  1 | Peter Goodguy     | mailto:peter@example.org |
\ No newline at end of file
diff --git a/tests/case_sparql_select/.gitignore b/tests/case_sparql_select/.gitignore
new file mode 100644
index 0000000..a85ef3b
--- /dev/null
+++ b/tests/case_sparql_select/.gitignore
@@ -0,0 +1,2 @@
+output.html
+output.md
diff --git a/tests/case_sparql_select/Makefile b/tests/case_sparql_select/Makefile
new file mode 100644
index 0000000..200c36b
--- /dev/null
+++ b/tests/case_sparql_select/Makefile
@@ -0,0 +1,64 @@
+#!/usr/bin/make -f
+
+# This software was developed at the National Institute of Standards
+# and Technology by employees of the Federal Government in the course
+# of their official duties. Pursuant to title 17 Section 105 of the
+# United States Code this software is not subject to copyright
+# protection and is in the public domain. NIST assumes no
+# responsibility whatsoever for its use by other parties, and makes
+# no guarantees, expressed or implied, about its quality,
+# reliability, or any other characteristic.
+#
+# We would appreciate acknowledgement if the software is used.
+
+SHELL := /bin/bash
+
+top_srcdir := $(shell cd ../.. ; pwd)
+
+tests_srcdir := $(top_srcdir)/tests
+
+all: \
+  output.html \
+  output.md
+
+.PHONY: \
+  check-html \
+  check-markdown
+
+.PRECIOUS: \
+  output.%
+
+check: \
+  check-html \
+  check-markdown
+
+check-html: \
+  .check-output.html \
+  output.html
+	diff $^
+
+check-markdown: \
+  .check-output.md \
+  output.md
+	diff $^
+
+clean:
+	@rm -rf \
+	  __pycache__
+	@rm -f \
+	  *.html \
+	  *.md \
+	  _*
+
+output.%: \
+  $(tests_srcdir)/.venv.done.log \
+  input-1.sparql \
+  input-2.ttl \
+  input-3.json
+	source $(tests_srcdir)/venv/bin/activate \
+	  && case_sparql_select \
+	    _$@ \
+	    input-1.sparql \
+	    input-2.ttl \
+	    input-3.json
+	mv _$@ $@
diff --git a/tests/case_sparql_select/input-1.sparql b/tests/case_sparql_select/input-1.sparql
new file mode 100644
index 0000000..ab97ec5
--- /dev/null
+++ b/tests/case_sparql_select/input-1.sparql
@@ -0,0 +1,9 @@
+# Query source:
+# https://www.w3.org/TR/rdf-sparql-query/#MultipleMatches
+
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+SELECT ?name ?mbox
+WHERE
+  { ?x foaf:name ?name .
+    ?x foaf:mbox ?mbox }
+ORDER BY ?name ?mbox
diff --git a/tests/case_sparql_select/input-2.ttl b/tests/case_sparql_select/input-2.ttl
new file mode 100644
index 0000000..d1ccff9
--- /dev/null
+++ b/tests/case_sparql_select/input-2.ttl
@@ -0,0 +1,5 @@
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+
+_:a foaf:name "Johnny Lee Outlaw" .
+_:a foaf:mbox <mailto:jlow@example.com> .
+_:c foaf:mbox <mailto:carol@example.org> .
diff --git a/tests/case_sparql_select/input-3.json b/tests/case_sparql_select/input-3.json
new file mode 100644
index 0000000..8ddfbf4
--- /dev/null
+++ b/tests/case_sparql_select/input-3.json
@@ -0,0 +1,13 @@
+{
+    "@context": {
+        "foaf": "http://xmlns.com/foaf/0.1/"
+    },
+    "@graph": [
+        {
+            "foaf:name": "Peter Goodguy",
+            "foaf:mbox": {
+                "@id": "mailto:peter@example.org"
+            }
+        }
+    ]
+}