diff --git a/doc/modules/ROOT/pages/tutorials/fastrp-and-knn.adoc b/doc/modules/ROOT/pages/tutorials/fastrp-and-knn.adoc
index c6036d388..1c0a68773 100644
--- a/doc/modules/ROOT/pages/tutorials/fastrp-and-knn.adoc
+++ b/doc/modules/ROOT/pages/tutorials/fastrp-and-knn.adoc
@@ -245,6 +245,10 @@ print(f"Nodes compared: {result['nodesCompared']}")
 print(f"Mean similarity: {result['similarityDistribution']['mean']}")
 ----
 
+----
+ABCDE
+----
+
 As we can see the mean similarity between nodes is quite high.
 This is due to the fact that we have a small example where there are no long paths between nodes leading to many similar FastRP node embeddings.
 
diff --git a/examples/fastrp-and-knn.ipynb b/examples/fastrp-and-knn.ipynb
index 61424de6c..2b3bd5b18 100644
--- a/examples/fastrp-and-knn.ipynb
+++ b/examples/fastrp-and-knn.ipynb
@@ -284,7 +284,11 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "9b132f95",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "replace-output-with:test.txt"
+    ]
+   },
    "outputs": [],
    "source": [
     "# Run kNN and write back to db (we skip memory estimation this time...)\n",
diff --git a/examples/test.txt b/examples/test.txt
new file mode 100644
index 000000000..9812a4b23
--- /dev/null
+++ b/examples/test.txt
@@ -0,0 +1 @@
+ABCDE
diff --git a/scripts/nb2doc/convert.sh b/scripts/nb2doc/convert.sh
index 08febc21f..2269bac1d 100755
--- a/scripts/nb2doc/convert.sh
+++ b/scripts/nb2doc/convert.sh
@@ -1,28 +1,5 @@
 #!/bin/bash
 
-DOC_DIR=doc/modules/ROOT/pages/tutorials
-NB_DIR=examples
-
-for notebook in ${NB_DIR}/*.ipynb
-do
-    docfile=$(basename ${notebook} | cut -d. -f1)
-    echo "${notebook} -> ${DOC_DIR}/${docfile}.adoc"
-
-    # --noprompt
-    #  Skips the "In/Out" lines before each cell
-    # --ClearMetadataPreprocessor.enabled=True
-    #  Cleans the "ipython3" language replacing it with "Python"
-    # (for Asciidoc code cells)
-    # --ASCIIDocExporter.file_extension=.adoc
-    # If not set, the extension is .asciidoc
-
-    jupyter nbconvert \
-        --to asciidoc \
-        --template=scripts/nb2doc/asciidoc-template \
-        --output-dir ${DOC_DIR} \
-        --ASCIIDocExporter.file_extension=.adoc \
-        --no-prompt \
-        --ClearMetadataPreprocessor.enabled=True \
-        ${notebook}
-done
-
+python ./scripts/nb2doc/convert_notebooks.py \
+  -o "doc/modules/ROOT/pages/tutorials" \
+  -i "examples/"
diff --git a/scripts/nb2doc/convert_notebooks.py b/scripts/nb2doc/convert_notebooks.py
new file mode 100644
index 000000000..f4b85318d
--- /dev/null
+++ b/scripts/nb2doc/convert_notebooks.py
@@ -0,0 +1,120 @@
+# reasons for not using nbconvert cli tool:
+# * cannot keep output based on a given tag
+
+import argparse
+import logging.config
+import re
+import sys
+from pathlib import Path
+
+import nbconvert
+from nbconvert.preprocessors import Preprocessor
+
+REPLACE_CELL_OUTPUT_TAG_PATTTERN = r"replace-output-with\:(.*)"
+METADATA_TAG_KEY = "tags"
+
+TEMPLATE_DIR = Path("scripts/nb2doc/asciidoc-template")
+
+# The root logger defaults to WARNING, which would drop every `logger.info`
+# call below, so configure it explicitly to emit INFO messages on stdout.
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger()
+
+
+class OutputReplacerPreprocessor(Preprocessor):
+    """
+    Replaces the output of a tagged code cell with the content of a file.
+
+    Expected tag format: `replace-output-with:images/some.png`.
+    The path after the colon is resolved relative to `replace_base_dir`.
+    """
+
+    def __init__(self, replace_base_dir: Path, **kw):
+        self._replace_base_dir = replace_base_dir
+        super().__init__(**kw)
+
+    def preprocess_cell(self, cell, resources, cell_index):
+        """
+        Apply a transformation on each cell. See base.py for details.
+
+        Cells without a matching tag are returned unchanged. Raises
+        `ValueError` when a cell carries more than one matching tag.
+        """
+        tags = cell["metadata"].get(METADATA_TAG_KEY, [])
+        matches = [m for m in (re.match(REPLACE_CELL_OUTPUT_TAG_PATTTERN, tag) for tag in tags) if m]
+        if not matches:
+            return cell, resources
+        if len(matches) > 1:
+            replace_tags = [m.string for m in matches]
+            raise ValueError(
+                f"Expected one or zero tags matching `{REPLACE_CELL_OUTPUT_TAG_PATTTERN}`. But got `{replace_tags}`"
+            )
+
+        # Take the path from the capture group instead of splitting on ":",
+        # so that a path which itself contains a colon is not truncated.
+        new_output_file_name = matches[0].group(1).strip()
+        new_output_file = self._replace_base_dir.joinpath(new_output_file_name)
+        logger.info(f"Replace output with content from: {new_output_file}")
+        # Embed the text of the file (not the repr of a file object); the
+        # trailing newline is dropped so the listing gets no empty last line.
+        new_output = new_output_file.read_text(encoding="utf-8").rstrip("\n")
+        # TODO Implement according to https://nbformat.readthedocs.io/en/latest/format_description.html#display-data
+        cell.outputs = [
+            {
+                "output_type": "display_data",
+                "data": {"text/plain": new_output},
+                "metadata": {},
+            }
+        ]
+        cell.execution_count = None
+        return cell, resources
+
+
+def to_output_file(input_file: Path, output_dir: Path) -> Path:
+    """Return the `.adoc` path inside `output_dir` for the notebook `input_file`."""
+    return output_dir.joinpath(input_file.name).with_suffix(".adoc")
+
+
+def main(input_path: Path, output_dir: Path) -> None:
+    """Convert one notebook, or every notebook in a directory, to AsciiDoc."""
+    if input_path.is_file():
+        notebooks = [input_path]
+        # Replacement files are resolved next to the notebook itself.
+        replace_base_dir = input_path.parent
+    else:
+        notebooks = sorted(f for f in input_path.iterdir() if f.is_file() and f.suffix == ".ipynb")
+        replace_base_dir = input_path
+
+    exporter = nbconvert.ASCIIDocExporter(template_file=str(TEMPLATE_DIR.joinpath("index.adoc.j2")))
+    # Skips the "In/Out" lines before each cell
+    exporter.exclude_input_prompt = True
+    exporter.exclude_output_prompt = True
+
+    # The mask is a set of metadata keys to keep; the tags must survive the
+    # cleaning because the output replacer registered after it reads them.
+    metadata_cleaner = nbconvert.preprocessors.ClearMetadataPreprocessor(
+        preserve_cell_metadata_mask={METADATA_TAG_KEY}
+    )
+    output_replacer = OutputReplacerPreprocessor(replace_base_dir=replace_base_dir)
+
+    exporter.register_preprocessor(metadata_cleaner, enabled=True)
+    exporter.register_preprocessor(output_replacer, enabled=True)
+
+    logger.info(f"Converting {len(notebooks)} notebooks.")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for notebook in notebooks:
+        output_file = to_output_file(notebook, output_dir)
+        logger.info(f"Converting notebook from `{notebook}` to: `{output_file}`")
+        converted, _ = exporter.from_filename(notebook)
+        output_file.write_text(converted, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-o", "--output", required=True, help="directory to write the result to")
+    parser.add_argument("-i", "--input", required=True, help="path to a notebook file or a directory of notebooks")
+
+    args = parser.parse_args()
+
+    main(Path(args.input), Path(args.output))