Skip to content

case_sparql_select: Add JSON output mode #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 147 additions & 67 deletions case_utils/case_sparql_select/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,74 +49,44 @@
_logger = logging.getLogger(os.path.basename(__file__))


def main() -> None:
parser = argparse.ArgumentParser()

# Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
logging.basicConfig(
level=logging.DEBUG
if ("--debug" in sys.argv or "-d" in sys.argv)
else logging.INFO
)

parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument(
"--built-version",
choices=tuple(built_version_choices_list),
default="case-" + CURRENT_CASE_VERSION,
help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
)
parser.add_argument(
"--disallow-empty-results",
action="store_true",
help="Raise error if no results are returned for query.",
)
parser.add_argument(
"--use-prefixes",
action="store_true",
help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
)
parser.add_argument(
"out_table",
help="Expected extensions are .html for HTML tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values.",
)
parser.add_argument(
"in_sparql",
help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
)
parser.add_argument("in_graph", nargs="+")
args = parser.parse_args()
def query_text_to_variables(select_query_text: str) -> typing.List[str]:
    """
    Extract the projected variable names (e.g. ``["?s", "?p"]``) from the
    ``SELECT`` line of a SPARQL SELECT query.

    :param select_query_text: Full text of a SPARQL SELECT query.  The
      projection line must start with the literal prefix ``"SELECT "``
      (upper-case, column 0), optionally followed by ``DISTINCT``.
    :returns: Variable names in projection order, each retaining its
      leading ``?``.
    :raises IndexError: If no line of the query starts with ``"SELECT "``.
    """
    # Build columns list from SELECT line.
    select_query_text_lines = select_query_text.split("\n")
    select_line = [
        line for line in select_query_text_lines if line.startswith("SELECT ")
    ][0]
    # Strip the keywords, then split on runs of whitespace.  A bare
    # .split(" ") would yield empty-string "variables" if the SELECT line
    # contained consecutive spaces (or miss splits on tabs), which would
    # later surface as blank DataFrame column names.
    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split()
    return variables

graph = rdflib.Graph()
for in_graph_filename in args.in_graph:
graph.parse(in_graph_filename)

def graph_and_query_to_data_frame(
graph: rdflib.Graph,
select_query_text: str,
*args: typing.Any,
built_version: str = "case-" + CURRENT_CASE_VERSION,
disallow_empty_results: bool = False,
use_prefixes: bool = False,
**kwargs: typing.Any,
) -> pd.DataFrame:
# Inherit prefixes defined in input context dictionary.
nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}

select_query_text = None
with open(args.in_sparql, "r") as in_fh:
select_query_text = in_fh.read().strip()
_logger.debug("select_query_text = %r." % select_query_text)

# Avoid side-effects on input parameter.
if "subClassOf" in select_query_text:
case_utils.ontology.load_subclass_hierarchy(
graph, built_version=args.built_version
)
_graph = rdflib.Graph()
_graph += graph
case_utils.ontology.load_subclass_hierarchy(_graph, built_version=built_version)
else:
_graph = graph

# Build columns list from SELECT line.
select_query_text_lines = select_query_text.split("\n")
select_line = [
line for line in select_query_text_lines if line.startswith("SELECT ")
][0]
variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
variables = query_text_to_variables(select_query_text)

tally = 0
records = []
select_query_object = rdflib.plugins.sparql.processor.prepareQuery(
select_query_text, initNs=nsdict
)
for (row_no, row) in enumerate(graph.query(select_query_object)):
for (row_no, row) in enumerate(_graph.query(select_query_object)):
tally = row_no + 1
record = []
for (column_no, column) in enumerate(row):
Expand All @@ -131,7 +101,7 @@ def main() -> None:
# .decode() is because hexlify returns bytes.
column_value = binascii.hexlify(column.toPython()).decode()
elif isinstance(column, rdflib.URIRef):
if args.use_prefixes:
if use_prefixes:
column_value = graph.namespace_manager.qname(column.toPython())
else:
column_value = column.toPython()
Expand All @@ -141,39 +111,149 @@ def main() -> None:
_logger.debug("row[0]column[%d] = %r." % (column_no, column_value))
record.append(column_value)
records.append(record)

if tally == 0:
if args.disallow_empty_results:
if disallow_empty_results:
raise ValueError("Failed to return any results.")

df = pd.DataFrame(records, columns=variables)
return df


def data_frame_to_table_text(
    df: pd.DataFrame,
    *args: typing.Any,
    json_indent: typing.Optional[int] = None,
    json_orient: str,
    output_mode: str,
    **kwargs: typing.Any,
) -> str:
    """
    Render a Pandas DataFrame as table text in the requested serialization.

    :param df: Query results to serialize.
    :param json_indent: Indentation width for JSON output; ``None`` yields
      compact single-line JSON.  Only used when ``output_mode`` is ``"json"``.
    :param json_orient: Pandas ``DataFrame.to_json`` ``orient`` value
      (e.g. ``"columns"``, ``"records"``).  Only used for JSON output.
    :param output_mode: One of ``"csv"``, ``"tsv"``, ``"html"``, ``"json"``,
      ``"md"``.
    :returns: The serialized table as a string.
    :raises NotImplementedError: If ``output_mode`` is not recognized.
    """
    table_text: typing.Optional[str] = None

    if output_mode in {"csv", "tsv"}:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
        # Membership in the two-element set above guarantees exactly one of
        # these branches, so no unreachable fallback is needed.
        sep: str = "," if output_mode == "csv" else "\t"
        table_text = df.to_csv(sep=sep)
    elif output_mode == "html":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(classes=("table", "table-bordered", "table-condensed"))
    elif output_mode == "json":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html
        table_text = df.to_json(
            indent=json_indent, orient=json_orient, date_format="iso"
        )
    elif output_mode == "md":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume Github-flavored Markdown.
        table_text = df.to_markdown(tablefmt="github")

    if table_text is None:
        raise NotImplementedError("Unimplemented output mode: %r." % output_mode)

    return table_text


def main() -> None:
parser = argparse.ArgumentParser()

# Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
logging.basicConfig(
level=logging.DEBUG
if ("--debug" in sys.argv or "-d" in sys.argv)
else logging.INFO
)

parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument(
"--built-version",
choices=tuple(built_version_choices_list),
default="case-" + CURRENT_CASE_VERSION,
help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
)
parser.add_argument(
"--disallow-empty-results",
action="store_true",
help="Raise error if no results are returned for query.",
)
parser.add_argument(
"--json-indent",
type=int,
help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
)
parser.add_argument(
"--json-orient",
default="columns",
choices=("columns", "index", "records", "split", "table", "values"),
help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
)
parser.add_argument(
"--use-prefixes",
action="store_true",
help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
)
parser.add_argument(
"out_table",
help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
)
parser.add_argument(
"in_sparql",
help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
)

parser.add_argument("in_graph", nargs="+")
args = parser.parse_args()

output_mode: str
if args.out_table.endswith(".csv"):
output_mode = "csv"
elif args.out_table.endswith(".html"):
output_mode = "html"
elif args.out_table.endswith(".json"):
output_mode = "json"
elif args.out_table.endswith(".md"):
output_mode = "md"
elif args.out_table.endswith(".tsv"):
output_mode = "tsv"
else:
raise NotImplementedError("Output file extension not implemented.")

graph = rdflib.Graph()
for in_graph_filename in args.in_graph:
graph.parse(in_graph_filename)

select_query_text: typing.Optional[str] = None
with open(args.in_sparql, "r") as in_fh:
select_query_text = in_fh.read().strip()
if select_query_text is None:
raise ValueError("Failed to load query.")
_logger.debug("select_query_text = %r." % select_query_text)

df = graph_and_query_to_data_frame(
graph,
select_query_text,
built_version=args.built_version,
disallow_empty_results=args.disallow_empty_results is True,
use_prefixes=args.use_prefixes is True,
)

table_text = data_frame_to_table_text(
df,
json_indent=args.json_indent,
json_orient=args.json_orient,
output_mode=output_mode,
)
with open(args.out_table, "w") as out_fh:
out_fh.write(table_text)
if table_text[-1] != "\n":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"?nFile":{
"0":"kb:file-1",
"1":"kb:file-2"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"?nFile":{"0":"kb:file-1","1":"kb:file-2"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"?name":{
"0":"Johnny Lee Outlaw",
"1":"Peter Goodguy"
},
"?mbox":{
"0":"mailto:jlow@example.com",
"1":"mailto:peter@example.org"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"?name":{"0":"Johnny Lee Outlaw","1":"Peter Goodguy"},"?mbox":{"0":"mailto:jlow@example.com","1":"mailto:peter@example.org"}}
2 changes: 1 addition & 1 deletion tests/case_utils/case_sparql_select/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
prefixed*
subclass-*.md
w3-output.*
w3-output*
Loading