diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 6d9c69d317c..14f29bc2234 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -15,6 +15,10 @@ sudo apt-get update || sudo apt-get install libgnutls30 sudo apt-get update sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync +# Install pandoc (does not install from pypi) +sudo apt-get update +sudo apt-get install -y pandoc + # NS: Path to python runtime should already be part of docker container # export PATH=/opt/conda/bin:$PATH rm -rf src @@ -63,6 +67,9 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then # Step 3: Run `make docs` to generate HTML files and static files for these tutorials make docs + # Step 3.1: Run the post-processing script: + python .jenkins/post_process_notebooks.py + # Step 4: If any of the generated files are not related the tutorial files we want to run, # then we remove them set +x @@ -140,6 +147,9 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then bash $DIR/remove_invisible_code_block_batch.sh docs python .jenkins/validate_tutorials_built.py + # Step 5.1: Run post-processing script on .ipynb files: + python .jenkins/post_process_notebooks.py + # Step 6: Copy generated HTML files and static files to S3 7z a manager.7z docs awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z diff --git a/.jenkins/custom_pandoc_filter.py b/.jenkins/custom_pandoc_filter.py new file mode 100644 index 00000000000..f4ceb0df11e --- /dev/null +++ b/.jenkins/custom_pandoc_filter.py @@ -0,0 +1,139 @@ +from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock +import markdown +import html + +def to_markdown(item, skip_octicon=False): + # A handler function to process strings, links, code, and code + # blocks + if item['t'] == 'Str': + return item['c'] + elif item['t'] == 'Space': + return ' ' + elif item['t'] == 'Link': + link_text = ''.join(to_markdown(i, skip_octicon) for i in item['c'][1]) + return f'{link_text}' + 
elif item['t'] == 'Code': + # Need to remove octicon as they don't render in .ipynb + if any(value == 'octicon' for key, value in item['c'][0][2]): + return '' + else: + # Escape the code and wrap it in tags + return f'{html.escape(item["c"][1])}' + elif item['t'] == 'CodeBlock': + # Escape the code block and wrap it in
 tags
+        return f'
{html.escape(item["c"][1])}
' + else: + return '' + + +def process_admonitions(key, value, format, meta): + # Replace admonitions with proper HTML. + if key == 'Div': + [[ident, classes, keyvals], contents] = value + if 'note' in classes: + color = '#54c7ec' + label = 'NOTE:' + elif 'tip' in classes: + color = '#6bcebb' + label = 'TIP:' + elif 'warning' in classes: + color = '#e94f3b' + label = 'WARNING:' + else: + return + + note_content = [] + for block in contents: + if block.get('t') == 'Para': + for item in block['c']: + if item['t'] == 'Str': + note_content.append(Str(item['c'])) + elif item['t'] == 'Space': + note_content.append(Space()) + elif item['t'] == 'Link': + note_content.append(Link(*item['c'])) + elif item['t'] == 'Code': + note_content.append(Code(*item['c'])) + elif block.get('t') == 'CodeBlock': + note_content.append(CodeBlock(*block['c'])) + + note_content_md = ''.join(to_markdown(item) for item in note_content) + html_content = markdown.markdown(note_content_md) + + return [{'t': 'RawBlock', 'c': ['html', f'
{label}
']}, {'t': 'RawBlock', 'c': ['html', '
']}, {'t': 'RawBlock', 'c': ['html', html_content]}, {'t': 'RawBlock', 'c': ['html', '
']}] + elif key == 'RawBlock': + # this is needed for the cells that have embedded video. + # We add a special tag to those: ``` {.python .jupyter-code-cell} + # The post-processing script then finds those and generates separate + # code cells that can load video. + [format, content] = value + if format == 'html' and 'iframe' in content: + # Extract the video URL + video_url = content.split('src="')[1].split('"')[0] + # Create the Python code to display the video + python_code = f""" +from IPython.display import display, HTML +html_code = \""" +{content} +\""" +display(HTML(html_code)) +""" + + return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]} + + +def process_images(key, value, format, meta): + # Add https://pytorch.org/tutorials/ to images so that they + # load correctly in the notebook. + if key != 'Image': + return None + [ident, classes, keyvals], caption, [src, title] = value + if not src.startswith('http'): + while src.startswith('../'): + src = src[3:] + if src.startswith('/_static'): + src = src[1:] + src = 'https://pytorch.org/tutorials/' + src + + return {'t': 'Image', 'c': [[ident, classes, keyvals], caption, [src, title]]} + + +def process_grids(key, value, format, meta): + # Generate side by side grid cards. Only for the two-cards layout + # that we use in the tutorial template. + if key == 'Div': + [[ident, classes, keyvals], contents] = value + if 'grid' in classes: + columns = ['
', + '
'] + column_num = 0 + for block in contents: + if 't' in block and block['t'] == 'Div' and 'grid-item-card' in block['c'][0][1]: + item_html = '' + for item in block['c'][1]: + if item['t'] == 'Para': + item_html += '

' + ''.join(to_markdown(i) for i in item['c']) + '

' + elif item['t'] == 'BulletList': + item_html += '
    ' + for list_item in item['c']: + item_html += '
  • ' + ''.join(to_markdown(i) for i in list_item[0]['c']) + '
  • ' + item_html += '
' + columns[column_num] += item_html + column_num = (column_num + 1) % 2 + columns = [column + '
' for column in columns] + return {'t': 'RawBlock', 'c': ['html', ''.join(columns)]} + +def is_code_block(item): + return item['t'] == 'Code' and 'octicon' in item['c'][1] + + +def process_all(key, value, format, meta): + for transform in [process_admonitions, process_images, process_grids]: + new_value = transform(key, value, format, meta) + if new_value is not None: + break + return new_value + + +if __name__ == "__main__": + toJSONFilter(process_all) diff --git a/.jenkins/post_process_notebooks.py b/.jenkins/post_process_notebooks.py new file mode 100644 index 00000000000..81f51766c3e --- /dev/null +++ b/.jenkins/post_process_notebooks.py @@ -0,0 +1,97 @@ +import nbformat as nbf +import os +import re + +""" +This post-processing script needs to run after the .ipynb files are +generated. The script removes extraneous ```{=html} syntax from the +admonitions and splits the cells that have video iframe into a +separate code cell that can be run to load the video directly +in the notebook. This script is included in build.sh. +""" + + +# Pattern to search ``` {.python .jupyter-code-cell} +pattern = re.compile(r'(.*?)``` {.python .jupyter-code-cell}\n\n(from IPython.display import display, HTML\nhtml_code = """\n.*?\n"""\ndisplay\(HTML\(html_code\)\))\n```(.*)', re.DOTALL) + + +def process_video_cell(notebook_path): + """ + This function finds the markdown cells with the + "``` {.python .jupyter-code-cell}" code blocks and slices them + into a separate code cell (instead of markdown) which allows to + load the video in the notebook. The rest of the content is placed + in a new markdown cell.
+ """ + print(f'Processing file: {notebook_path}') + notebook = nbf.read(notebook_path, as_version=4) + + # Iterate over markdown cells + for i, cell in enumerate(notebook.cells): + if cell.cell_type == 'markdown': + match = pattern.search(cell.source) + if match: + print(f'Match found in cell {i}: {match.group(0)[:100]}...') + # Extract the parts before and after the video code block + before_html_block = match.group(1) + code_block = match.group(2) + + # Add a comment to run the cell to display the video + code_block = "# Run this cell to load the video\n" + code_block + # Create a new code cell + new_code_cell = nbf.v4.new_code_cell(source=code_block) + + # Replace the original markdown cell with the part before the code block + cell.source = before_html_block + + # Insert the new code cell after the current one + notebook.cells.insert(i+1, new_code_cell) + print(f'New code cell created with source: {new_code_cell.source}') + + # If there is content after the HTML code block, create a new markdown cell + if len(match.group(3).strip()) > 0: + after_html_block = match.group(3) + new_markdown_cell = nbf.v4.new_markdown_cell(source=after_html_block) + # Create a new markdown cell and add the content after code block there + notebook.cells.insert(i+2, new_markdown_cell) + + else: + # Remove ```{=html} from the code block + cell.source = remove_html_tag(cell.source) + + nbf.write(notebook, notebook_path) + + +def remove_html_tag(content): + """ + Pandoc adds an extraneous ```{=html} ``` to raw HTML blocks which + prevents it from rendering correctly. This function removes + ```{=html} that we don't need. + """ + content = re.sub(r'```{=html}\n\n```', '">', content) + content = re.sub(r'<\/div>\n```', '
\n', content) + content = re.sub(r'```{=html}\n\n```', '\n', content) + content = re.sub(r'```{=html}', '', content) + content = re.sub(r'

\n```', '

', content) + return content + + +def walk_dir(downloads_dir): + """ + Walk the dir and process all notebook files in + the _downloads directory and its subdirectories. + """ + for root, dirs, files in os.walk(downloads_dir): + for filename in files: + if filename.endswith('.ipynb'): + process_video_cell(os.path.join(root, filename)) + + +def main(): + downloads_dir = './docs/_downloads' + walk_dir(downloads_dir) + + +if __name__ == "__main__": + main() diff --git a/conf.py b/conf.py index e0d1d6fda6b..36bf506f864 100644 --- a/conf.py +++ b/conf.py @@ -42,7 +42,8 @@ import distutils.file_util import re from get_sphinx_filenames import SPHINX_SHOULD_RUN - +import pandocfilters +import pypandoc import plotly.io as pio pio.renderers.default = 'sphinx_gallery' @@ -74,7 +75,8 @@ 'sphinx.ext.intersphinx', 'sphinx_copybutton', 'sphinx_gallery.gen_gallery', - 'sphinx_design' + 'sphinx_design', + 'nbsphinx' ] intersphinx_mapping = { @@ -107,7 +109,10 @@ def reset_seeds(gallery_conf, fname): "# https://pytorch.org/tutorials/beginner/colab\n" "%matplotlib inline"), 'reset_modules': (reset_seeds), - 'ignore_pattern': r'_torch_export_nightly_tutorial.py' + 'ignore_pattern': r'_torch_export_nightly_tutorial.py', + 'pypandoc': {'extra_args': ['--mathjax', '--toc'], + 'filters': ['.jenkins/custom_pandoc_filter.py'], + }, } if os.getenv('GALLERY_PATTERN'): diff --git a/requirements.txt b/requirements.txt index 2aa065fd241..843362dd095 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,12 @@ sphinx==5.0.0 sphinx-gallery==0.11.1 sphinx_design +nbsphinx docutils==0.16 sphinx-copybutton +pypandoc==1.12 +pandocfilters +markdown tqdm==4.66.1 numpy==1.24.4 matplotlib @@ -28,7 +32,7 @@ torchx torchrl==0.3.0 tensordict==0.3.0 ax-platform -nbformat>=4.2.0 +nbformat>=5.9.2 datasets transformers torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable