Skip to content

Add CDO concept typo-checker based on set-differencing URIRefs using CDO prefixes #77

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions CONTRIBUTE.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ pushd case_utils/ontology
git add case-0.6.0.ttl # Assuming CASE 0.6.0 was just released.
# and/or
git add uco-0.8.0.ttl # Assuming UCO 0.8.0 was adopted in CASE 0.6.0.

git add ontology_and_version_iris.txt
popd
make check
# Assuming `make check` passes:
git commit -m "Build CASE 0.6.0 monolithic .ttl files" case_utils/ontology/case-0.6.0-subclasses.ttl case_utils/ontology/case-0.6.0.ttl
git commit -m "Build CASE 0.6.0 monolithic .ttl files" case_utils/ontology/case-0.6.0-subclasses.ttl case_utils/ontology/case-0.6.0.ttl case_utils/ontology/ontology_and_version_iris.txt
git commit -m "Update CASE ontology pointer to version 0.6.0" dependencies/CASE case_utils/ontology/version_info.py
```

Expand All @@ -43,4 +45,4 @@ pre-commit --version
The `pre-commit` tool hooks into Git's commit machinery to run a set of linters and static analyzers over each change. To install `pre-commit` into Git's hooks, run:
```bash
pre-commit install
```
```
94 changes: 93 additions & 1 deletion case_utils/case_validate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,39 @@
import os
import sys
import typing
import warnings

import pyshacl # type: ignore
import rdflib.util
import rdflib

import case_utils.ontology
from case_utils.ontology.version_info import (
CURRENT_CASE_VERSION,
built_version_choices_list,
)

NS_OWL = rdflib.OWL
NS_RDF = rdflib.RDF
NS_RDFS = rdflib.RDFS

_logger = logging.getLogger(os.path.basename(__file__))


class NonExistentCDOConceptWarning(UserWarning):
    """
    Warning category raised for a concept found in the data graph that is not part of the CDO ontologies, as selected by the --built-version and --ontology-graph flags.
    """


def concept_is_cdo_concept(n_concept: rdflib.URIRef) -> bool:
    """
    Report whether an IRI sits under one of the CDO ontology IRI prefixes.

    :param n_concept: The IRI node to test.  Only its string form is inspected.
    :returns: True when the string form starts with the UCO prefix (https://ontology.unifiedcyberontology.org/) or the CASE prefix (https://ontology.caseontology.org/); False otherwise.
    """
    # str.startswith accepts a tuple of prefixes, so one call covers both
    # ontology namespaces.
    return str(n_concept).startswith(
        (
            "https://ontology.unifiedcyberontology.org/",
            "https://ontology.caseontology.org/",
        )
    )


def main() -> None:
parser = argparse.ArgumentParser(
description="CASE wrapper to pySHACL command line tool."
Expand Down Expand Up @@ -160,6 +180,71 @@ def main() -> None:
_logger.debug("arg_ontology_graph = %r.", arg_ontology_graph)
ontology_graph.parse(arg_ontology_graph)

# Construct set of CDO concepts for data graph concept-existence review.
cdo_concepts: typing.Set[rdflib.URIRef] = set()

for n_structural_class in [
NS_OWL.Class,
NS_OWL.AnnotationProperty,
NS_OWL.DatatypeProperty,
NS_OWL.ObjectProperty,
NS_RDFS.Datatype,
]:
for ontology_triple in ontology_graph.triples(
(None, NS_RDF.type, n_structural_class)
):
if not isinstance(ontology_triple[0], rdflib.URIRef):
continue
if concept_is_cdo_concept(ontology_triple[0]):
cdo_concepts.add(ontology_triple[0])
for n_ontology_predicate in [
NS_OWL.backwardCompatibleWith,
NS_OWL.imports,
NS_OWL.incompatibleWith,
NS_OWL.priorVersion,
NS_OWL.versionIRI,
]:
for ontology_triple in ontology_graph.triples(
(None, n_ontology_predicate, None)
):
assert isinstance(ontology_triple[0], rdflib.URIRef)
assert isinstance(ontology_triple[2], rdflib.URIRef)
cdo_concepts.add(ontology_triple[0])
cdo_concepts.add(ontology_triple[2])
for ontology_triple in ontology_graph.triples((None, NS_RDF.type, NS_OWL.Ontology)):
if not isinstance(ontology_triple[0], rdflib.URIRef):
continue
cdo_concepts.add(ontology_triple[0])

# Also load historical ontology and version IRIs.
ontology_and_version_iris_data = importlib.resources.read_text(
case_utils.ontology, "ontology_and_version_iris.txt"
)
for line in ontology_and_version_iris_data.split("\n"):
cleaned_line = line.strip()
if cleaned_line == "":
continue
cdo_concepts.add(rdflib.URIRef(cleaned_line))

data_cdo_concepts: typing.Set[rdflib.URIRef] = set()
for data_triple in data_graph.triples((None, None, None)):
for data_triple_member in data_triple:
if isinstance(data_triple_member, rdflib.URIRef):
if concept_is_cdo_concept(data_triple_member):
data_cdo_concepts.add(data_triple_member)
elif isinstance(data_triple_member, rdflib.Literal):
if isinstance(data_triple_member.datatype, rdflib.URIRef):
if concept_is_cdo_concept(data_triple_member.datatype):
data_cdo_concepts.add(data_triple_member.datatype)

undefined_cdo_concepts = data_cdo_concepts - cdo_concepts
for undefined_cdo_concept in sorted(undefined_cdo_concepts):
warnings.warn(undefined_cdo_concept, NonExistentCDOConceptWarning)
undefined_cdo_concepts_message = (
"There were %d concepts with CDO IRIs in the data graph that are not in the ontology graph."
% len(undefined_cdo_concepts)
)

# Determine output format.
# pySHACL's determination of output formatting is handled solely
# through the -f flag. Other CASE CLI tools handle format
Expand Down Expand Up @@ -214,6 +299,13 @@ def main() -> None:
% type(validation_graph)
)

if len(undefined_cdo_concepts) > 0:
warnings.warn(undefined_cdo_concepts_message)
if not args.allow_warnings:
undefined_cdo_concepts_alleviation_message = "The data graph is SHACL-conformant with the CDO ontologies, but nonexistent-concept references raise Warnings with this tool. Please either correct the concept names in the data graph; use the --ontology-graph flag to pass a corrected CDO ontology file, also using --built-version none; or, use the --allow-warnings flag."
warnings.warn(undefined_cdo_concepts_alleviation_message)
conforms = False

sys.exit(0 if conforms else 1)


Expand Down
15 changes: 14 additions & 1 deletion case_utils/ontology/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ RDF_TOOLKIT_JAR := $(uco_srcdir)/lib/rdf-toolkit.jar
case_version := $(shell python3 version_info.py)

all: \
case-$(case_version)-subclasses.ttl
ontology_and_version_iris.txt

.PRECIOUS: \
case-$(case_version).ttl
Expand Down Expand Up @@ -79,3 +79,16 @@ case-$(case_version)-subclasses.ttl: \
clean:
@rm -f \
case-$(case_version)*.ttl

# Regenerate the list of ontology and version IRIs by running
# src/ontology_and_version_iris.py over every case-*.ttl file in this
# directory.  The script writes to a scratch file (_$@), which is renamed
# onto the target as the final recipe step.
ontology_and_version_iris.txt: \
src/ontology_and_version_iris.py \
case-$(case_version)-subclasses.ttl
# Guarantee venv is built. (Same rationale as in the subclasses.ttl recipe.)
$(MAKE) \
--directory $(case_srcdir)/tests \
.venv.done.log
source $(case_srcdir)/tests/venv/bin/activate \
&& python3 src/ontology_and_version_iris.py \
_$@ \
case-*.ttl
mv _$@ $@
72 changes: 72 additions & 0 deletions case_utils/ontology/ontology_and_version_iris.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
http://case.example.org/core
https://ontology.caseontology.org/case/case
https://ontology.caseontology.org/case/case/0.7.1
https://ontology.caseontology.org/case/case/1.0.0
https://ontology.caseontology.org/case/investigation
https://ontology.caseontology.org/case/investigation/0.7.1
https://ontology.caseontology.org/case/investigation/1.0.0
https://ontology.caseontology.org/case/vocabulary
https://ontology.caseontology.org/case/vocabulary/0.7.1
https://ontology.caseontology.org/case/vocabulary/1.0.0
https://ontology.unifiedcyberontology.org/co
https://ontology.unifiedcyberontology.org/co/1.0.0
https://ontology.unifiedcyberontology.org/owl
https://ontology.unifiedcyberontology.org/owl/1.0.0
https://ontology.unifiedcyberontology.org/uco/action
https://ontology.unifiedcyberontology.org/uco/action/0.9.1
https://ontology.unifiedcyberontology.org/uco/action/1.0.0
https://ontology.unifiedcyberontology.org/uco/configuration
https://ontology.unifiedcyberontology.org/uco/configuration/1.0.0
https://ontology.unifiedcyberontology.org/uco/core
https://ontology.unifiedcyberontology.org/uco/core/0.9.1
https://ontology.unifiedcyberontology.org/uco/core/1.0.0
https://ontology.unifiedcyberontology.org/uco/identity
https://ontology.unifiedcyberontology.org/uco/identity/0.9.1
https://ontology.unifiedcyberontology.org/uco/identity/1.0.0
https://ontology.unifiedcyberontology.org/uco/location
https://ontology.unifiedcyberontology.org/uco/location/0.9.1
https://ontology.unifiedcyberontology.org/uco/location/1.0.0
https://ontology.unifiedcyberontology.org/uco/marking
https://ontology.unifiedcyberontology.org/uco/marking/0.9.1
https://ontology.unifiedcyberontology.org/uco/marking/1.0.0
https://ontology.unifiedcyberontology.org/uco/observable
https://ontology.unifiedcyberontology.org/uco/observable/0.9.1
https://ontology.unifiedcyberontology.org/uco/observable/1.0.0
https://ontology.unifiedcyberontology.org/uco/pattern
https://ontology.unifiedcyberontology.org/uco/pattern/0.9.1
https://ontology.unifiedcyberontology.org/uco/pattern/1.0.0
https://ontology.unifiedcyberontology.org/uco/role
https://ontology.unifiedcyberontology.org/uco/role/0.9.1
https://ontology.unifiedcyberontology.org/uco/role/1.0.0
https://ontology.unifiedcyberontology.org/uco/time
https://ontology.unifiedcyberontology.org/uco/time/0.9.1
https://ontology.unifiedcyberontology.org/uco/time/1.0.0
https://ontology.unifiedcyberontology.org/uco/tool
https://ontology.unifiedcyberontology.org/uco/tool/0.9.1
https://ontology.unifiedcyberontology.org/uco/tool/1.0.0
https://ontology.unifiedcyberontology.org/uco/types
https://ontology.unifiedcyberontology.org/uco/types/0.9.1
https://ontology.unifiedcyberontology.org/uco/types/1.0.0
https://ontology.unifiedcyberontology.org/uco/uco
https://ontology.unifiedcyberontology.org/uco/uco/0.9.1
https://ontology.unifiedcyberontology.org/uco/uco/1.0.0
https://ontology.unifiedcyberontology.org/uco/victim
https://ontology.unifiedcyberontology.org/uco/victim/0.9.1
https://ontology.unifiedcyberontology.org/uco/victim/1.0.0
https://ontology.unifiedcyberontology.org/uco/vocabulary
https://ontology.unifiedcyberontology.org/uco/vocabulary/0.9.1
https://ontology.unifiedcyberontology.org/uco/vocabulary/1.0.0
https://unifiedcyberontology.org/ontology/uco/action
https://unifiedcyberontology.org/ontology/uco/core
https://unifiedcyberontology.org/ontology/uco/identity
https://unifiedcyberontology.org/ontology/uco/location
https://unifiedcyberontology.org/ontology/uco/marking
https://unifiedcyberontology.org/ontology/uco/observable
https://unifiedcyberontology.org/ontology/uco/pattern
https://unifiedcyberontology.org/ontology/uco/role
https://unifiedcyberontology.org/ontology/uco/time
https://unifiedcyberontology.org/ontology/uco/tool
https://unifiedcyberontology.org/ontology/uco/types
https://unifiedcyberontology.org/ontology/uco/uco
https://unifiedcyberontology.org/ontology/uco/victim
https://unifiedcyberontology.org/ontology/uco/vocabulary
90 changes: 90 additions & 0 deletions case_utils/ontology/src/ontology_and_version_iris.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python3

# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.

"""
This script creates a list of all ontology and version IRIs that have ever existed in a CDO ontology to describe a CDO ontology. I.e. the subjects of triples with rdf:type as predicate and owl:Ontology as object are included, as are the subjects and objects of version-referencing triples (owl:versionIRI, owl:incompatibleWith, etc.).
"""

__version__ = "0.1.0"

import argparse
import typing

import rdflib

NS_OWL = rdflib.OWL
NS_RDF = rdflib.RDF


def concept_is_cdo_concept(n_concept: rdflib.URIRef) -> bool:
    """
    Report whether an IRI is recognized as a CDO IRI for the purposes of this script.

    This function is purposefully distinct from the function used in case_validate.  Within this script, the publishing history of CASE and UCO is reviewed, so more IRI forms are accepted here: the ontology.unifiedcyberontology.org and ontology.caseontology.org prefixes, the unifiedcyberontology.org/ontology/ and caseontology.org/ontology/ prefixes, and the single exact IRI http://case.example.org/core.

    :param n_concept: The IRI node to test.  Only its string form is inspected.
    :returns: True when the IRI matches one of the forms above; False otherwise.
    """
    concept_iri = str(n_concept)
    # The example.org IRI is matched exactly, not as a prefix.
    if concept_iri == "http://case.example.org/core":
        return True
    # str.startswith accepts a tuple of prefixes, so one call covers all four.
    return concept_iri.startswith(
        (
            "https://ontology.unifiedcyberontology.org/",
            "https://ontology.caseontology.org/",
            "https://unifiedcyberontology.org/ontology/",
            "https://caseontology.org/ontology/",
        )
    )


def extract_ontology_iris(ontology_graph: rdflib.Graph) -> typing.Set[rdflib.URIRef]:
    """
    Collect the IRIs that describe the OWL ontology (or ontologies) in the input graph.

    Classes, properties, etc. defined within the ontology are not returned.  The result holds:
    * The subject and object of every triple using one of the ontology-versioning predicates listed below.  These are not filtered by IRI prefix here.
    * Every IRI-identified subject typed as owl:Ontology that passes concept_is_cdo_concept.

    :param ontology_graph: Graph to inspect.
    :returns: The set of IRIs found.
    :raises AssertionError: If a versioning triple has a subject or object that is not a URIRef.
    """
    versioning_predicates = (
        NS_OWL.backwardCompatibleWith,
        NS_OWL.imports,
        NS_OWL.incompatibleWith,
        NS_OWL.priorVersion,
        NS_OWL.versionIRI,
    )

    found_iris: typing.Set[rdflib.URIRef] = set()

    for n_predicate in versioning_predicates:
        for n_subject, _, n_object in ontology_graph.triples(
            (None, n_predicate, None)
        ):
            # Both ends of a versioning triple are expected to be IRIs.
            assert isinstance(n_subject, rdflib.URIRef)
            assert isinstance(n_object, rdflib.URIRef)
            found_iris.update((n_subject, n_object))

    for n_subject, _, _ in ontology_graph.triples(
        (None, NS_RDF.type, NS_OWL.Ontology)
    ):
        # Blank-node ontologies and non-CDO ontology IRIs are left out.
        if isinstance(n_subject, rdflib.URIRef) and concept_is_cdo_concept(
            n_subject
        ):
            found_iris.add(n_subject)

    return found_iris


def main() -> None:
    """
    Command-line entry point: write the sorted set of CDO ontology and version IRIs found in the input graphs.

    Positional arguments:
    * out_txt - Path of the text file to write, one IRI per line.
    * in_ttl - One or more graph files to parse with rdflib.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("out_txt")
    parser.add_argument("in_ttl", nargs="+")
    args = parser.parse_args()

    cdo_concepts: typing.Set[rdflib.URIRef] = set()
    for in_ttl in args.in_ttl:
        ontology_graph = rdflib.Graph()
        ontology_graph.parse(in_ttl)
        # extract_ontology_iris does not prefix-filter the subjects and
        # objects of versioning triples, so filter everything here.
        cdo_concepts.update(
            ontology_concept
            for ontology_concept in extract_ontology_iris(ontology_graph)
            if concept_is_cdo_concept(ontology_concept)
        )

    # Write UTF-8 explicitly rather than the locale's default encoding.
    # case_validate loads this file with importlib.resources.read_text,
    # which decodes as UTF-8 by default.
    with open(args.out_txt, "w", encoding="utf-8") as out_fh:
        for cdo_concept in sorted(cdo_concepts):
            print(cdo_concept, file=out_fh)


if __name__ == "__main__":
main()
Loading