Skip to content

case_sparql_select: Cross-test JSON output and --no-(header,index) flag branches #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 225 additions & 69 deletions case_utils/case_sparql_select/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,74 +49,44 @@
_logger = logging.getLogger(os.path.basename(__file__))


def main() -> None:
parser = argparse.ArgumentParser()

# Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
logging.basicConfig(
level=logging.DEBUG
if ("--debug" in sys.argv or "-d" in sys.argv)
else logging.INFO
)

parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument(
"--built-version",
choices=tuple(built_version_choices_list),
default="case-" + CURRENT_CASE_VERSION,
help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
)
parser.add_argument(
"--disallow-empty-results",
action="store_true",
help="Raise error if no results are returned for query.",
)
parser.add_argument(
"--use-prefixes",
action="store_true",
help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
)
parser.add_argument(
"out_table",
help="Expected extensions are .html for HTML tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values.",
)
parser.add_argument(
"in_sparql",
help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
)
parser.add_argument("in_graph", nargs="+")
args = parser.parse_args()
def query_text_to_variables(select_query_text: str) -> typing.List[str]:
# Build columns list from SELECT line.
select_query_text_lines = select_query_text.split("\n")
select_line = [
line for line in select_query_text_lines if line.startswith("SELECT ")
][0]
variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
return variables

graph = rdflib.Graph()
for in_graph_filename in args.in_graph:
graph.parse(in_graph_filename)

def graph_and_query_to_data_frame(
graph: rdflib.Graph,
select_query_text: str,
*args: typing.Any,
built_version: str = "case-" + CURRENT_CASE_VERSION,
disallow_empty_results: bool = False,
use_prefixes: bool = False,
**kwargs: typing.Any,
) -> pd.DataFrame:
# Inherit prefixes defined in input context dictionary.
nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}

select_query_text = None
with open(args.in_sparql, "r") as in_fh:
select_query_text = in_fh.read().strip()
_logger.debug("select_query_text = %r." % select_query_text)

# Avoid side-effects on input parameter.
if "subClassOf" in select_query_text:
case_utils.ontology.load_subclass_hierarchy(
graph, built_version=args.built_version
)
_graph = rdflib.Graph()
_graph += graph
case_utils.ontology.load_subclass_hierarchy(_graph, built_version=built_version)
else:
_graph = graph

# Build columns list from SELECT line.
select_query_text_lines = select_query_text.split("\n")
select_line = [
line for line in select_query_text_lines if line.startswith("SELECT ")
][0]
variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
variables = query_text_to_variables(select_query_text)

tally = 0
records = []
select_query_object = rdflib.plugins.sparql.processor.prepareQuery(
select_query_text, initNs=nsdict
)
for (row_no, row) in enumerate(graph.query(select_query_object)):
for (row_no, row) in enumerate(_graph.query(select_query_object)):
tally = row_no + 1
record = []
for (column_no, column) in enumerate(row):
Expand All @@ -131,7 +101,7 @@ def main() -> None:
# .decode() is because hexlify returns bytes.
column_value = binascii.hexlify(column.toPython()).decode()
elif isinstance(column, rdflib.URIRef):
if args.use_prefixes:
if use_prefixes:
column_value = graph.namespace_manager.qname(column.toPython())
else:
column_value = column.toPython()
Expand All @@ -141,39 +111,225 @@ def main() -> None:
_logger.debug("row[0]column[%d] = %r." % (column_no, column_value))
record.append(column_value)
records.append(record)

if tally == 0:
if args.disallow_empty_results:
if disallow_empty_results:
raise ValueError("Failed to return any results.")

df = pd.DataFrame(records, columns=variables)
return df


def data_frame_to_table_text(
df: pd.DataFrame,
*args: typing.Any,
json_indent: typing.Optional[int] = None,
json_orient: str,
output_mode: str,
use_header: bool,
use_index: bool,
**kwargs: typing.Any,
) -> str:
table_text: typing.Optional[str] = None
if args.out_table.endswith(".csv") or args.out_table.endswith(".tsv"):
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

# Set up kwargs dicts. One kwarg behaves slightly differently for Markdown vs. other formats.
general_kwargs: typing.Dict[str, typing.Any] = dict()
md_kwargs: typing.Dict[str, typing.Any] = dict()

# Note some output modes will drop 'header' from general_kwargs, due to alternate support or lack of support.
if use_header:
general_kwargs["header"] = True
else:
general_kwargs["header"] = False
md_kwargs["headers"] = tuple()

general_kwargs["index"] = use_index

if output_mode in {"csv", "tsv"}:
sep: str
if args.out_table.endswith(".csv"):
if output_mode == "csv":
sep = ","
elif args.out_table.endswith(".tsv"):
elif output_mode == "tsv":
sep = "\t"
else:
raise NotImplementedError(
"Output extension not implemented in CSV-style output."
)
table_text = df.to_csv(sep=sep)
elif args.out_table.endswith(".html"):
table_text = df.to_csv(sep=sep, **general_kwargs)
elif output_mode == "html":
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
# Add CSS classes for CASE website Bootstrap support.
table_text = df.to_html(classes=("table", "table-bordered", "table-condensed"))
elif args.out_table.endswith(".md"):
table_text = df.to_html(
classes=("table", "table-bordered", "table-condensed"), **general_kwargs
)
elif output_mode == "json":
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

# Drop unsupported kwarg.
del general_kwargs["header"]

table_text = df.to_json(
indent=json_indent, orient=json_orient, date_format="iso", **general_kwargs
)
elif output_mode == "md":
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
# https://pypi.org/project/tabulate/
# Assume Github-flavored Markdown.
table_text = df.to_markdown(tablefmt="github")
if table_text is None:
raise NotImplementedError(
"Unsupported output extension for output filename %r.", args.out_table

# Drop unsupported kwarg.
del general_kwargs["header"]

table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs)
else:
if table_text is None:
raise NotImplementedError("Unimplemented output mode: %r." % output_mode)
assert table_text is not None

return table_text


def main() -> None:
parser = argparse.ArgumentParser()

# Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
logging.basicConfig(
level=logging.DEBUG
if ("--debug" in sys.argv or "-d" in sys.argv)
else logging.INFO
)

parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument(
"--built-version",
choices=tuple(built_version_choices_list),
default="case-" + CURRENT_CASE_VERSION,
help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
)
parser.add_argument(
"--disallow-empty-results",
action="store_true",
help="Raise error if no results are returned for query.",
)
parser.add_argument(
"--json-indent",
type=int,
help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
)
parser.add_argument(
"--json-orient",
default="columns",
choices=("columns", "index", "records", "split", "table", "values"),
help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
)
parser.add_argument(
"--use-prefixes",
action="store_true",
help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
)
parser.add_argument(
"out_table",
help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
)
parser.add_argument(
"in_sparql",
help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
)

parser_header_group = parser.add_mutually_exclusive_group(required=False)
parser_header_group.add_argument(
"--header",
action="store_true",
help="Print column labels. This is the default behavior.",
)
parser_header_group.add_argument(
"--no-header",
action="store_true",
help="Do not print column labels.",
)

parser_index_group = parser.add_mutually_exclusive_group(required=False)
parser_index_group.add_argument(
"--index",
action="store_true",
help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.",
)
parser_index_group.add_argument(
"--no-index",
action="store_true",
help="Do not print index. If output is JSON, --json-orient must be 'split' or 'table'.",
)

parser.add_argument("in_graph", nargs="+")
args = parser.parse_args()

output_mode: str
if args.out_table.endswith(".csv"):
output_mode = "csv"
elif args.out_table.endswith(".html"):
output_mode = "html"
elif args.out_table.endswith(".json"):
output_mode = "json"
elif args.out_table.endswith(".md"):
output_mode = "md"
elif args.out_table.endswith(".tsv"):
output_mode = "tsv"
else:
raise NotImplementedError("Output file extension not implemented.")

graph = rdflib.Graph()
for in_graph_filename in args.in_graph:
graph.parse(in_graph_filename)

select_query_text: typing.Optional[str] = None
with open(args.in_sparql, "r") as in_fh:
select_query_text = in_fh.read().strip()
if select_query_text is None:
raise ValueError("Failed to load query.")
_logger.debug("select_query_text = %r." % select_query_text)

# Process --header and --no-header.
use_header: bool
if args.header is True:
use_header = True
if args.no_header is True:
use_header = False
else:
use_header = True

# Process --index and --no-index.
use_index: bool
if args.index is True:
use_index = True
if args.no_index is True:
use_index = False
else:
use_index = True

if (
output_mode == "json"
and use_index is False
and args.json_orient not in {"split", "table"}
):
raise ValueError(
"For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
)

df = graph_and_query_to_data_frame(
graph,
select_query_text,
built_version=args.built_version,
disallow_empty_results=args.disallow_empty_results is True,
use_prefixes=args.use_prefixes is True,
)

table_text = data_frame_to_table_text(
df,
json_indent=args.json_indent,
json_orient=args.json_orient,
output_mode=output_mode,
use_header=use_header,
use_index=use_index,
)
with open(args.out_table, "w") as out_fh:
out_fh.write(table_text)
if table_text[-1] != "\n":
Expand Down
1 change: 1 addition & 0 deletions tests/case_utils/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ check: \
&& pytest \
--ignore case_file \
--ignore case_sparql_construct \
--ignore case_sparql_select \
--ignore case_validate \
--log-level=DEBUG

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"?nFile":{
"0":"kb:file-1",
"1":"kb:file-2"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"?nFile":{"0":"kb:file-1","1":"kb:file-2"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"?name":{
"0":"Johnny Lee Outlaw",
"1":"Peter Goodguy"
},
"?mbox":{
"0":"mailto:jlow@example.com",
"1":"mailto:peter@example.org"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"?name":{"0":"Johnny Lee Outlaw","1":"Peter Goodguy"},"?mbox":{"0":"mailto:jlow@example.com","1":"mailto:peter@example.org"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"0":{"?name":"Johnny Lee Outlaw","?mbox":"mailto:jlow@example.com"},"1":{"?name":"Peter Goodguy","?mbox":"mailto:peter@example.org"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"?name":"Johnny Lee Outlaw","?mbox":"mailto:jlow@example.com"},{"?name":"Peter Goodguy","?mbox":"mailto:peter@example.org"}]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"columns":["?name","?mbox"],"index":[0,1],"data":[["Johnny Lee Outlaw","mailto:jlow@example.com"],["Peter Goodguy","mailto:peter@example.org"]]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"?name","type":"string"},{"name":"?mbox","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"?name":"Johnny Lee Outlaw","?mbox":"mailto:jlow@example.com"},{"index":1,"?name":"Peter Goodguy","?mbox":"mailto:peter@example.org"}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[["Johnny Lee Outlaw","mailto:jlow@example.com"],["Peter Goodguy","mailto:peter@example.org"]]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,?name,?mbox
0,Johnny Lee Outlaw,mailto:jlow@example.com
1,Peter Goodguy,mailto:peter@example.org
Loading