Process notebooks with pypandoc and a custom pandoc filter (#2741)

svekars · malfet · web-flow · commit 30e14df26091 · 2024-03-13T09:44:45.000-07:00
Process notebooks with pypandoc and a custom pandoc filter
To fix a number of formatting issues with notebooks, this change adds pypandoc to process the notebooks, along with a custom pandoc filter and a post-processing script.
The custom filter does the following:

* Handles admonitions - replaces notes, tips, and warnings with correct HTML blocks.
* Handles blocks of code that have video frames - adds a tag ``` {.python .jupyter-code-cell}
* Handles images - prepends the pytorch.org/tutorials to the local paths so that the images display correctly.
* Handles 2-column grid layout - replaces it with side-by-side cards.

The post-processing script:

* Cleans up the resulting notebooks and removes the unneeded ```{=html} syntax that prevents the notebooks from rendering
* Searches for ``` {.python .jupyter-code-cell} in the generated .ipynb files and slices the cells so that the HTML video frame can be run as a code cell that displays the video

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
diff --git a/.jenkins/build.sh b/.jenkins/build.sh
@@ -15,6 +15,10 @@ sudo apt-get update || sudo apt-get install libgnutls30
 sudo apt-get update
 sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync
 
+# Install pandoc (does not install from pypi)
+sudo apt-get update
+sudo apt-get install -y pandoc
+
 # NS: Path to python runtime should already be part of docker container
 # export PATH=/opt/conda/bin:$PATH
 rm -rf src
@@ -63,6 +67,9 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then
   # Step 3: Run `make docs` to generate HTML files and static files for these tutorials
   make docs
 
+  # Step 3.1: Run the post-processing script:
+  python .jenkins/post_process_notebooks.py
+
   # Step 4: If any of the generated files are not related the tutorial files we want to run,
   # then we remove them
   set +x
@@ -140,6 +147,9 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then
   bash $DIR/remove_invisible_code_block_batch.sh docs
   python .jenkins/validate_tutorials_built.py
 
+  # Step 5.1: Run post-processing script on .ipynb files:
+  python .jenkins/post_process_notebooks.py
+
   # Step 6: Copy generated HTML files and static files to S3
   7z a manager.7z docs
   awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z
diff --git a/.jenkins/custom_pandoc_filter.py b/.jenkins/custom_pandoc_filter.py
@@ -0,0 +1,139 @@
+from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock
+import markdown
+import html
+
+def to_markdown(item, skip_octicon=False):
+    """Render one pandoc AST element as an HTML/Markdown text fragment.
+
+    Args:
+        item: A pandoc AST node, i.e. a dict with a ``'t'`` (type) key and,
+            for most types, a ``'c'`` (content) key.
+        skip_octicon: Only forwarded to the recursive calls made for link
+            text. ``Code`` elements that carry an ``octicon`` attribute
+            value are dropped regardless of this flag.
+
+    Returns:
+        The text for ``Str``, a single space for ``Space``, an ``<a>`` tag
+        for ``Link``, escaped ``<code>`` / ``<pre><code>`` markup for
+        ``Code`` / ``CodeBlock``, and an empty string for any other type.
+    """
+    kind = item['t']
+
+    if kind == 'Str':
+        return item['c']
+
+    if kind == 'Space':
+        return ' '
+
+    if kind == 'Link':
+        label = ''.join(
+            to_markdown(child, skip_octicon) for child in item['c'][1]
+        )
+        href = item['c'][2][0]
+        return f'<a href="{href}">{label}</a>'
+
+    if kind == 'Code':
+        # Octicons do not render in .ipynb files, so they are removed.
+        for _name, attr_value in item['c'][0][2]:
+            if attr_value == 'octicon':
+                return ''
+        # Escape the code so it is shown literally inside <code> tags.
+        return f'<code>{html.escape(item["c"][1])}</code>'
+
+    if kind == 'CodeBlock':
+        # Escape the code block so it is shown literally inside <pre><code>.
+        return f'<pre><code>{html.escape(item["c"][1])}</code></pre>'
+
+    return ''
+
+
+def process_admonitions(key, value, format, meta):
+    """Rewrite admonition Divs and raw HTML iframes for notebook output.
+
+    Two kinds of elements are handled:
+
+    * ``Div`` elements whose classes include ``note``, ``tip`` or
+      ``warning`` are replaced with raw HTML blocks: a colored label bar
+      followed by a box that contains the rendered admonition text.
+    * ``RawBlock`` HTML elements that contain an ``iframe`` (embedded video)
+      are replaced with a code block tagged ``python`` and
+      ``jupyter-code-cell``. The post-processing script looks for that tag
+      and generates a separate code cell that can load the video.
+
+    Args:
+        key: Type name of the element being visited.
+        value: Content of the element.
+        format: Unused here; part of the filter action signature.
+        meta: Unused here; part of the filter action signature.
+
+    Returns:
+        The replacement element (or list of elements), or ``None`` when the
+        element should be left unchanged.
+    """
+    if key == 'Div':
+        [[ident, classes, keyvals], contents] = value
+        # The first matching class wins, in this order.
+        for css_class, color, label in (
+            ('note', '#54c7ec', 'NOTE:'),
+            ('tip', '#6bcebb', 'TIP:'),
+            ('warning', '#e94f3b', 'WARNING:'),
+        ):
+            if css_class in classes:
+                break
+        else:
+            return None
+
+        # Paragraphs are flattened into one run of inline elements; only
+        # the element types that to_markdown knows how to render are kept.
+        inline_types = ('Str', 'Space', 'Link', 'Code')
+        note_content = []
+        for block in contents:
+            block_type = block.get('t')
+            if block_type == 'Para':
+                note_content.extend(
+                    item for item in block['c'] if item['t'] in inline_types
+                )
+            elif block_type == 'CodeBlock':
+                note_content.append(block)
+
+        note_content_md = ''.join(to_markdown(item) for item in note_content)
+        html_content = markdown.markdown(note_content_md)
+
+        header_html = (
+            f'<div style="background-color: {color}; color: #fff; '
+            'font-weight: 700; padding-left: 10px; padding-top: 5px; '
+            f'padding-bottom: 5px"><strong>{label}</strong></div>'
+        )
+        box_open_html = (
+            '<div style="background-color: #f3f4f7; padding-left: 10px; '
+            'padding-top: 10px; padding-bottom: 10px; padding-right: 10px">'
+        )
+        return [
+            {'t': 'RawBlock', 'c': ['html', header_html]},
+            {'t': 'RawBlock', 'c': ['html', box_open_html]},
+            {'t': 'RawBlock', 'c': ['html', html_content]},
+            {'t': 'RawBlock', 'c': ['html', '</div>']},
+        ]
+
+    if key == 'RawBlock':
+        # Cells with an embedded video get a special tag:
+        # ``` {.python .jupyter-code-cell}
+        # A separate local name is used so the ``format`` argument is not
+        # shadowed by the raw block's own format.
+        [raw_format, content] = value
+        if raw_format == 'html' and 'iframe' in content:
+            # The whole iframe markup is embedded as-is, so no attribute of
+            # the iframe (such as its src) needs to be parsed here.
+            python_code = f"""
+from IPython.display import display, HTML
+html_code = \"""
+{content}
+\"""
+display(HTML(html_code))
+"""
+
+            return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]}
+
+    return None
+
+
+def process_images(key, value, format, meta):
+    """Make image sources absolute so they load inside the notebook.
+
+    Sources that do not start with ``http`` have every leading ``../``
+    removed, lose the leading slash of a ``/_static`` path, and are then
+    prefixed with ``https://pytorch.org/tutorials/``.
+
+    Args:
+        key: Type name of the element being visited.
+        value: Content of the element; for an ``Image`` this is
+            ``[[ident, classes, keyvals], caption, [src, title]]``.
+        format: Unused here; part of the filter action signature.
+        meta: Unused here; part of the filter action signature.
+
+    Returns:
+        A new ``Image`` element, or ``None`` for any other element type.
+    """
+    if key == 'Image':
+        (ident, classes, keyvals), caption, (src, title) = value
+        is_remote = src.startswith('http')
+        if not is_remote:
+            # Drop every leading "../" segment.
+            while src[:3] == '../':
+                src = src[3:]
+            # "/_static/..." becomes "_static/..." to avoid a double slash.
+            if src.startswith('/_static'):
+                src = src[1:]
+            src = 'https://pytorch.org/tutorials/' + src
+        return {'t': 'Image', 'c': [[ident, classes, keyvals], caption, [src, title]]}
+
+    return None
+
+
+def process_grids(key, value, format, meta):
+    """Turn a ``grid`` Div into two side-by-side HTML columns.
+
+    Only the two-card layout used in the tutorial template is supported:
+    ``grid-item-card`` Divs are placed alternately into a left-floated and
+    a right-floated column. Inside a card, paragraphs are rendered as
+    ``<h2>`` headings and bullet lists as ``<ul>`` lists.
+
+    Args:
+        key: Type name of the element being visited.
+        value: Content of the element.
+        format: Unused here; part of the filter action signature.
+        meta: Unused here; part of the filter action signature.
+
+    Returns:
+        A raw HTML block with both columns, or ``None`` when the element is
+        not a Div with the ``grid`` class.
+    """
+    if key != 'Div':
+        return None
+    [[ident, classes, keyvals], contents] = value
+    if 'grid' not in classes:
+        return None
+
+    # One list of HTML fragments per column, joined at the end.
+    column_parts = [
+        ['<div style="width: 45%; float: left; padding: 20px;">'],
+        ['<div style="width: 45%; float: right; padding: 20px;">'],
+    ]
+    cards_seen = 0
+    for block in contents:
+        is_card = (
+            't' in block
+            and block['t'] == 'Div'
+            and 'grid-item-card' in block['c'][0][1]
+        )
+        if not is_card:
+            continue
+
+        # Cards alternate between the left and the right column.
+        target = column_parts[cards_seen % 2]
+        cards_seen += 1
+        for element in block['c'][1]:
+            element_type = element['t']
+            if element_type == 'Para':
+                heading = ''.join(to_markdown(inline) for inline in element['c'])
+                target.append('<h2>' + heading + '</h2>')
+            elif element_type == 'BulletList':
+                target.append('<ul>')
+                for list_item in element['c']:
+                    # Only the first block of each list item is rendered.
+                    entry = ''.join(
+                        to_markdown(inline) for inline in list_item[0]['c']
+                    )
+                    target.append('<li>' + entry + '</li>')
+                target.append('</ul>')
+
+    for parts in column_parts:
+        parts.append('</div>')
+    grid_html = ''.join(''.join(parts) for parts in column_parts)
+    return {'t': 'RawBlock', 'c': ['html', grid_html]}
+
+def is_code_block(item):
+    """Return True for an inline ``Code`` element whose text contains 'octicon'."""
+    if item['t'] != 'Code':
+        return False
+    return 'octicon' in item['c'][1]
+
+
+def process_all(key, value, format, meta):
+    """Apply the individual transforms in order and return the first result.
+
+    The admonition, image and grid transforms are tried in that order; the
+    first one that returns something other than ``None`` decides the
+    replacement for the element.
+
+    Returns:
+        The replacement produced by the first matching transform, or
+        ``None`` when no transform applies.
+    """
+    new_value = None
+    for transform in (process_admonitions, process_images, process_grids):
+        new_value = transform(key, value, format, meta)
+        if new_value is not None:
+            return new_value
+    return new_value
+
+
+# Entry point when this file is executed as a script; conf.py lists it under
+# the pypandoc 'filters' option, and process_all is the filter action.
+if __name__ == "__main__":
+    toJSONFilter(process_all)
diff --git a/.jenkins/post_process_notebooks.py b/.jenkins/post_process_notebooks.py
@@ -0,0 +1,97 @@
+import nbformat as nbf
+import os
+import re
+
+"""
+This post-processing script needs to run after the .ipynb files are
+generated. The script removes extraneous ```{=html} syntax from the
+admonitions and splits the cells that have video iframe into a 
+separate code cell that can be run to load the video directly
+in the notebook. This script is included in build.sh.
+"""
+
+
+# Pattern to search ``` {.python .jupyter-code-cell} blocks in a markdown cell.
+# Group 1 is the text before the block, group 2 is the Python code that
+# displays the video, and group 3 is the text after the closing fence.
+# The regex metacharacters in the literal parts ('.', '{', '}') are escaped
+# so that only the exact tag and import line are matched.
+pattern = re.compile(
+    r'(.*?)``` \{\.python \.jupyter-code-cell\}\n\n'
+    r'(from IPython\.display import display, HTML\n'
+    r'html_code = """\n.*?\n"""\n'
+    r'display\(HTML\(html_code\)\))\n```(.*)',
+    re.DOTALL,
+)
+
+
+def process_video_cell(notebook_path):
+    """
+    Split markdown cells that contain a video code block and clean up
+    raw HTML fences in the given notebook, rewriting the file in place.
+
+    For every markdown cell that contains a
+    "``` {.python .jupyter-code-cell}" block, the text before the block
+    stays in the original cell, the block itself becomes a separate code
+    cell (instead of markdown) so that running it loads the video, and any
+    non-blank text after the block is placed in a new markdown cell.
+    The extraneous ```{=html} fences are removed from markdown text with
+    remove_html_tag().
+    """
+    print(f'Processing file: {notebook_path}')
+    notebook = nbf.read(notebook_path, as_version=4)
+
+    # Cells are inserted into notebook.cells while it is being iterated.
+    # The loop therefore also visits the inserted cells: the new code cell
+    # is skipped, and the new markdown cell is either split again (if it
+    # holds another video block) or cleaned by the no-match branch.
+    for i, cell in enumerate(notebook.cells):
+        if cell.cell_type != 'markdown':
+            continue
+
+        match = pattern.search(cell.source)
+        if not match:
+            # Remove ```{=html} from the cell
+            cell.source = remove_html_tag(cell.source)
+            continue
+
+        print(f'Match found in cell {i}: {match.group(0)[:100]}...')
+        # Extract the parts before and after the video code block
+        before_html_block, code_block, after_html_block = match.groups()
+
+        # Add a comment to run the cell to display the video
+        code_block = "# Run this cell to load the video\n" + code_block
+        new_code_cell = nbf.v4.new_code_cell(source=code_block)
+
+        # Keep only the part before the code block in the original cell.
+        # This cell is not visited again, so its ```{=html} fences have to
+        # be removed here.
+        cell.source = remove_html_tag(before_html_block)
+
+        # Insert the new code cell after the current one
+        notebook.cells.insert(i + 1, new_code_cell)
+        print(f'New code cell created with source: {new_code_cell.source}')
+
+        # If there is content after the code block, move it to a new
+        # markdown cell placed right after the code cell
+        if after_html_block.strip():
+            new_markdown_cell = nbf.v4.new_markdown_cell(source=after_html_block)
+            notebook.cells.insert(i + 2, new_markdown_cell)
+
+    nbf.write(notebook, notebook_path)
+
+
+def remove_html_tag(content):
+    """
+    Strip the ```{=html} fences that Pandoc puts around raw HTML blocks.
+
+    Those fences prevent the HTML from rendering correctly in the
+    notebook. The substitutions below are applied one after another, in
+    the listed order, and the resulting text is returned.
+    """
+    substitutions = (
+        (r'```{=html}\n<div', '<div'),
+        (r'">\n```', '">'),
+        (r'<\/div>\n```', '</div>\n'),
+        (r'```{=html}\n</div>\n```', '</div>\n'),
+        (r'```{=html}', ''),
+        (r'</p>\n```', '</p>'),
+    )
+    for regex, replacement in substitutions:
+        content = re.sub(regex, replacement, content)
+    return content
+
+
+def walk_dir(downloads_dir):
+    """
+    Recursively visit downloads_dir and run process_video_cell() on
+    every file whose name ends with '.ipynb'.
+    """
+    for root, _dirs, files in os.walk(downloads_dir):
+        notebook_names = (name for name in files if name.endswith('.ipynb'))
+        for name in notebook_names:
+            process_video_cell(os.path.join(root, name))
+
+
+def main():
+    """
+    Post-process every notebook found under ./docs/_downloads.
+    """
+    walk_dir('./docs/_downloads')
+
+
+# Entry point when the file is run as a script; build.sh invokes it as
+# `python .jenkins/post_process_notebooks.py`.
+if __name__ == "__main__":
+    main()
diff --git a/conf.py b/conf.py
@@ -42,7 +42,8 @@
 import distutils.file_util
 import re
 from get_sphinx_filenames import SPHINX_SHOULD_RUN
-
+import pandocfilters
+import pypandoc
 import plotly.io as pio
 pio.renderers.default = 'sphinx_gallery'
 
@@ -74,7 +75,8 @@
     'sphinx.ext.intersphinx',
     'sphinx_copybutton',
     'sphinx_gallery.gen_gallery',
-    'sphinx_design'
+    'sphinx_design',
+    'nbsphinx'
 ]
 
 intersphinx_mapping = {
@@ -107,7 +109,10 @@ def reset_seeds(gallery_conf, fname):
                             "# https://pytorch.org/tutorials/beginner/colab\n"
                             "%matplotlib inline"),
     'reset_modules': (reset_seeds),
-    'ignore_pattern': r'_torch_export_nightly_tutorial.py'
+    'ignore_pattern': r'_torch_export_nightly_tutorial.py',
+    'pypandoc': {'extra_args': ['--mathjax', '--toc'],
+                 'filters': ['.jenkins/custom_pandoc_filter.py'],
+    },
 }
 
 if os.getenv('GALLERY_PATTERN'):
diff --git a/requirements.txt b/requirements.txt
@@ -4,8 +4,12 @@
 sphinx==5.0.0
 sphinx-gallery==0.11.1
 sphinx_design
+nbsphinx
 docutils==0.16
 sphinx-copybutton
+pypandoc==1.12
+pandocfilters
+markdown
 tqdm==4.66.1
 numpy==1.24.4
 matplotlib
@@ -28,7 +32,7 @@ torchx
 torchrl==0.3.0
 tensordict==0.3.0
 ax-platform
-nbformat>=4.2.0
+nbformat>=5.9.2
 datasets
 transformers
 torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable