Skip to content

Commit c902db3

Browse files
Merge pull request #6 from casework/AC-178
Ac 178
2 parents b7ea257 + 1deba2f commit c902db3

File tree

19 files changed

+547
-2
lines changed

19 files changed

+547
-2
lines changed

README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,36 @@ case_file --disable-hashes sample.txt.json sample.txt
3535
```
3636

3737

38+
### SPARQL executors
39+
40+
Two commands are provided to generate output from a SPARQL query and one or more input graphs. Input graphs can be any graph, such as instance data or supplementary ontology files that supply custom class definitions or other external ontologies.
41+
42+
43+
#### `case_sparql_construct`
44+
45+
To use a SPARQL `CONSTRUCT` query to make a supplementary graph file from one or more input graphs:
46+
47+
```bash
48+
case_sparql_construct output.json input.sparql input.json [input-2.json ...]
49+
```
50+
51+
52+
#### `case_sparql_select`
53+
54+
To use a SPARQL `SELECT` query to make a table from one or more input graphs:
55+
56+
```bash
57+
# HTML output with Bootstrap classes
58+
# (e.g. for Jekyll-backed websites)
59+
case_sparql_select output.html input.sparql input.json [input-2.json ...]
60+
61+
# Markdown, GitHub-flavored
62+
case_sparql_select output.md input.sparql input.json [input-2.json ...]
63+
```
64+
65+
Note that `case_sparql_select` is not guaranteed to function with Python versions below 3.7.
66+
67+
3868
### `local_uuid`
3969

4070
This [module](case_utils/local_uuid.py) provides a wrapper UUID generator, `local_uuid()`. Its main purpose is making example data generate consistent identifiers, and intentionally includes mechanisms to make it difficult to activate this mode without awareness of the caller.
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/usr/bin/env python3
2+
3+
# This software was developed at the National Institute of Standards
4+
# and Technology by employees of the Federal Government in the course
5+
# of their official duties. Pursuant to title 17 Section 105 of the
6+
# United States Code this software is not subject to copyright
7+
# protection and is in the public domain. NIST assumes no
8+
# responsibility whatsoever for its use by other parties, and makes
9+
# no guarantees, expressed or implied, about its quality,
10+
# reliability, or any other characteristic.
11+
#
12+
# We would appreciate acknowledgement if the software is used.
13+
14+
"""
15+
This script executes a SPARQL CONSTRUCT query, returning a graph of the generated triples.
16+
"""
17+
18+
__version__ = "0.1.0"
19+
20+
import argparse
21+
import os
22+
import logging
23+
24+
import rdflib.plugins.sparql
25+
26+
import case_utils
27+
28+
_logger = logging.getLogger(os.path.basename(__file__))
29+
30+
def main():
    """Command-line entry point.

    Runs a SPARQL CONSTRUCT query (read from a query file) against the
    union of one or more input graphs, and serializes the constructed
    triples to the requested output graph file.

    Raises:
        ValueError: If the query file is empty, or if
            --disallow-empty-results is given and no triples were
            constructed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--debug", action="store_true")
    parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
    parser.add_argument("--output-format", help="Override extension-based format guesser.")
    parser.add_argument("out_graph")
    parser.add_argument("in_sparql")
    parser.add_argument("in_graph", nargs="+")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Aggregate all input graph files into a single graph for querying.
    in_graph = rdflib.Graph()
    for in_graph_filename in args.in_graph:
        in_graph.parse(in_graph_filename, format=case_utils.guess_format(in_graph_filename))
    _logger.debug("len(in_graph) = %d.", len(in_graph))

    out_graph = rdflib.Graph()

    # Inherit prefixes defined in input context dictionary.
    nsdict = {k: v for (k, v) in in_graph.namespace_manager.namespaces()}
    for prefix in sorted(nsdict.keys()):
        out_graph.bind(prefix, nsdict[prefix])

    _logger.debug("Running query in %r.", args.in_sparql)
    with open(args.in_sparql, "r") as in_fh:
        construct_query_text = in_fh.read().strip()
    # Validate with an explicit raise rather than assert (asserts are
    # stripped under python -O).
    if not construct_query_text:
        raise ValueError("Query file is empty: %r." % args.in_sparql)

    construct_query_object = rdflib.plugins.sparql.prepareQuery(construct_query_text, initNs=nsdict)

    # https://rdfextras.readthedocs.io/en/latest/working_with.html
    construct_query_result = in_graph.query(construct_query_object)
    _logger.debug("type(construct_query_result) = %r.", type(construct_query_result))
    _logger.debug("len(construct_query_result) = %d.", len(construct_query_result))
    for (row_no, row) in enumerate(construct_query_result):
        if row_no == 0:
            _logger.debug("row[0] = %r.", row)
        out_graph.add(row)

    # BUGFIX: --disallow-empty-results was parsed but never checked.
    # Honor it here, mirroring the behavior of case_sparql_select.
    if args.disallow_empty_results and len(out_graph) == 0:
        raise ValueError("Failed to construct any triples.")

    # Determine serialization format, preferring the explicit override.
    if args.output_format is None:
        output_format = case_utils.guess_format(args.out_graph)
    else:
        output_format = args.output_format

    serialize_kwargs = {
        "format": output_format
    }
    if output_format == "json-ld":
        # BUGFIX: original referenced the undefined name `graph` here
        # (NameError on any JSON-LD output).  The prefixes to embed in the
        # JSON-LD context are those bound on out_graph above.
        context_dictionary = {k: v for (k, v) in out_graph.namespace_manager.namespaces()}
        serialize_kwargs["context"] = context_dictionary

    out_graph.serialize(args.out_graph, **serialize_kwargs)

if __name__ == "__main__":
    main()
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python3
2+
3+
# This software was developed at the National Institute of Standards
4+
# and Technology by employees of the Federal Government in the course
5+
# of their official duties. Pursuant to title 17 Section 105 of the
6+
# United States Code this software is not subject to copyright
7+
# protection and is in the public domain. NIST assumes no
8+
# responsibility whatsoever for its use by other parties, and makes
9+
# no guarantees, expressed or implied, about its quality,
10+
# reliability, or any other characteristic.
11+
#
12+
# We would appreciate acknowledgement if the software is used.
13+
14+
"""
15+
This script executes a SPARQL SELECT query, returning a table representation. The design of the workflow is based on this example built on SPARQLWrapper:
16+
https://lawlesst.github.io/notebook/sparql-dataframe.html
17+
18+
Note that this assumes a limited syntax style in the outer SELECT clause of the query - only named variables, no aggregations, and a single space character separating all variable names. E.g.:
19+
20+
SELECT ?x ?y ?z
21+
WHERE
22+
{ ... }
23+
24+
The word "DISTINCT" will also be cut from the query, if present.
25+
26+
Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
27+
"""
28+
29+
__version__ = "0.3.0"
30+
31+
import argparse
32+
import binascii
33+
import os
34+
import logging
35+
36+
import pandas as pd
37+
import rdflib.plugins.sparql
38+
39+
import case_utils
40+
41+
NS_XSD = rdflib.XSD
42+
43+
_logger = logging.getLogger(os.path.basename(__file__))
44+
45+
def main():
    """Command-line entry point.

    Runs a SPARQL SELECT query (read from a query file) against the union
    of one or more input graphs, builds a pandas DataFrame from the result
    rows, and writes it as an HTML table (.html) or GitHub-flavored
    Markdown table (.md).

    Raises:
        ValueError: If --disallow-empty-results is given and the query
            returned no rows.
        NotImplementedError: If the output filename extension is not
            .html or .md.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--debug", action="store_true")
    parser.add_argument("--disallow-empty-results", action="store_true", help="Raise error if no results are returned for query.")
    parser.add_argument("out_table", help="Expected extensions are .html for HTML tables or .md for Markdown tables.")
    parser.add_argument("in_sparql")
    parser.add_argument("in_graph", nargs="+")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Aggregate all input graph files into a single graph for querying.
    graph = rdflib.Graph()
    for in_graph_filename in args.in_graph:
        graph.parse(in_graph_filename, format=case_utils.guess_format(in_graph_filename))

    # Inherit prefixes defined in input context dictionary.
    nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}

    with open(args.in_sparql, "r") as in_fh:
        select_query_text = in_fh.read().strip()
    _logger.debug("select_query_text = %r.", select_query_text)

    # Build columns list from SELECT line.
    # NOTE: per the module docstring, this assumes the outer SELECT clause
    # names only variables, on a single line, separated by single spaces.
    select_query_text_lines = select_query_text.split("\n")
    select_line = [line for line in select_query_text_lines if line.startswith("SELECT ")][0]
    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")

    tally = 0
    records = []
    select_query_object = rdflib.plugins.sparql.prepareQuery(select_query_text, initNs=nsdict)
    for (row_no, row) in enumerate(graph.query(select_query_object)):
        tally = row_no + 1
        record = []
        for (column_no, column) in enumerate(row):
            if column is None:
                # Unbound OPTIONAL variables render as empty cells.
                column_value = ""
            elif isinstance(column, rdflib.term.Literal) and column.datatype == NS_XSD.hexBinary:
                # Use hexlify to convert xsd:hexBinary to ASCII.
                # The render to ASCII is in support of this script rendering results for website viewing.
                # .decode() is because hexlify returns bytes.
                column_value = binascii.hexlify(column.toPython()).decode()
            else:
                column_value = column.toPython()
            if row_no == 0:
                _logger.debug("row[0]column[%d] = %r.", column_no, column_value)
            record.append(column_value)
        records.append(record)
    if tally == 0:
        if args.disallow_empty_results:
            raise ValueError("Failed to return any results.")

    df = pd.DataFrame(records, columns=variables)

    table_text = None
    if args.out_table.endswith(".html"):
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(classes=("table", "table-bordered", "table-condensed"))
    elif args.out_table.endswith(".md"):
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume Github-flavored Markdown.
        table_text = df.to_markdown(tablefmt="github")
    if table_text is None:
        # BUGFIX: original passed args.out_table as a second constructor
        # argument (logging-style), leaving %r unexpanded in the message.
        raise NotImplementedError("Unsupported output extension for output filename %r." % args.out_table)

    with open(args.out_table, "w") as out_fh:
        out_fh.write(table_text)

if __name__ == "__main__":
    main()

setup.cfg

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,18 @@ classifiers =
1818
# TODO The constraint on pyparsing can be removed when rdflib Issue #1190 is resolved.
1919
# https://github.com/RDFLib/rdflib/issues/1190
2020
install_requires =
21+
# Note that numpy (pandas dependency) is only supported in Python >= 3.7.
22+
pandas;python_version>='3.7'
2123
pyparsing < 3.0.0
2224
rdflib-jsonld
2325
requests
26+
tabulate
2427
packages = find:
2528
python_requires = >=3.6
2629

2730
[options.entry_points]
2831
console_scripts =
2932
case_file = case_utils.case_file:main
33+
case_sparql_construct = case_utils.case_sparql_construct:main
34+
# Note that numpy (a dependency of pandas, which case_sparql_select depends on) is only supported in Python >= 3.7.
35+
case_sparql_select = case_utils.case_sparql_select:main

tests/Makefile

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,26 @@ srcdir := $(shell pwd)
2020
PYTHON3 ?= $(shell which python3.9 2>/dev/null || which python3.8 2>/dev/null || which python3.7 2>/dev/null || which python3.6 2>/dev/null || which python3)
2121

2222
all: \
23-
all-case_file
23+
all-case_file \
24+
all-case_sparql_construct \
25+
all-case_sparql_select
2426

2527
.PHONY: \
2628
all-case_file \
29+
all-case_sparql_construct \
30+
all-case_sparql_select \
2731
check-case_file \
32+
check-case_sparql_construct \
33+
check-case_sparql_select \
2834
check-isomorphic_diff \
2935
download
3036

3137
.venv.done.log: \
3238
$(top_srcdir)/.git_submodule_init.done.log \
3339
$(top_srcdir)/case_utils/__init__.py \
3440
$(top_srcdir)/case_utils/case_file/__init__.py \
41+
$(top_srcdir)/case_utils/case_sparql_construct/__init__.py \
42+
$(top_srcdir)/case_utils/case_sparql_select/__init__.py \
3543
$(top_srcdir)/case_utils/local_uuid.py \
3644
$(top_srcdir)/setup.cfg \
3745
$(top_srcdir)/setup.py \
@@ -58,13 +66,30 @@ all-case_file: \
5866
$(MAKE) \
5967
--directory case_file
6068

69+
all-case_sparql_construct: \
70+
.venv.done.log
71+
$(MAKE) \
72+
--directory case_sparql_construct
73+
74+
all-case_sparql_select: \
75+
.venv.done.log
76+
# Only descend if python>=3.7, due to pandas dependency unsatisfiable in 3.6.x.
77+
# Boolean explanation: sys.exit(False) has exit status 0.
78+
venv/bin/python3 -c 'import sys ; sys.exit(not (sys.version_info < (3, 7)))' \
79+
|| $(MAKE) \
80+
--directory case_sparql_select
81+
6182
# These check calls are provided in preferred run-order.
6283
check: \
6384
check-isomorphic_diff \
64-
check-case_file
85+
check-case_file \
86+
check-case_sparql_construct \
87+
check-case_sparql_select
6588
source venv/bin/activate \
6689
&& pytest \
6790
--ignore case_file \
91+
--ignore case_sparql_construct \
92+
--ignore case_sparql_select \
6893
--log-level=DEBUG
6994

7095
check-case_file: \
@@ -73,13 +98,34 @@ check-case_file: \
7398
--directory case_file \
7499
check
75100

101+
check-case_sparql_construct: \
102+
.venv.done.log
103+
$(MAKE) \
104+
--directory case_sparql_construct \
105+
check
106+
107+
check-case_sparql_select: \
108+
.venv.done.log
109+
# Only descend if python>=3.7, due to pandas dependency unsatisfiable in 3.6.x.
110+
# Boolean explanation: sys.exit(False) has exit status 0.
111+
venv/bin/python3 -c 'import sys ; sys.exit(not (sys.version_info < (3, 7)))' \
112+
|| $(MAKE) \
113+
--directory case_sparql_select \
114+
check
115+
76116
check-isomorphic_diff: \
77117
.venv.done.log
78118
$(MAKE) \
79119
--directory isomorphic_diff \
80120
check
81121

82122
clean:
123+
@$(MAKE) \
124+
--directory case_sparql_select \
125+
clean
126+
@$(MAKE) \
127+
--directory case_sparql_construct \
128+
clean
83129
@$(MAKE) \
84130
--directory case_file \
85131
clean
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
output.ttl

0 commit comments

Comments
 (0)