Skip to content

Commit 30e14df

Browse files
svekarsmalfet
andauthored
Process notebooks with pypandoc and a custom pandoc filter (#2741)
Process notebooks with pypandoc and a custom pandoc filter To fix a number of formatting issues with notebooks, adding pypandoc to process the notebooks and a custom pandoc filter and a post-processing script. The custom filter does the following: * Handles admonitions - replaces notes, tips, and warnings with correct HTML blocks. * Handles blocks of code that have video frames - adds a tag ``` {.python .jupyter-code-cell} * Handles images - prepends the pytorch.org/tutorials to the local paths so that the images display correctly. * Handles 2-column grid layout - replaces it with side-by-side cards. The post-processing script: * Cleans up the resulting notebooks and removes the unneeded ```{=html} syntax that prevents the notebooks from rendering * Searches for ``` {.python .jupyter-code-cell} in the generated .ipynb files and slices the cells so that the HTML video frame can be run as a code cell and display that video Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
1 parent 8c48ada commit 30e14df

File tree

5 files changed

+259
-4
lines changed

5 files changed

+259
-4
lines changed

.jenkins/build.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ sudo apt-get update || sudo apt-get install libgnutls30
1515
sudo apt-get update
1616
sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync
1717

18+
# Install pandoc (does not install from pypi)
19+
sudo apt-get update
20+
sudo apt-get install -y pandoc
21+
1822
# NS: Path to python runtime should already be part of docker container
1923
# export PATH=/opt/conda/bin:$PATH
2024
rm -rf src
@@ -63,6 +67,9 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then
6367
# Step 3: Run `make docs` to generate HTML files and static files for these tutorials
6468
make docs
6569

70+
# Step 3.1: Run the post-processing script:
71+
python .jenkins/post_process_notebooks.py
72+
6673
# Step 4: If any of the generated files are not related the tutorial files we want to run,
6774
# then we remove them
6875
set +x
@@ -140,6 +147,9 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then
140147
bash $DIR/remove_invisible_code_block_batch.sh docs
141148
python .jenkins/validate_tutorials_built.py
142149

150+
# Step 5.1: Run post-processing script on .ipynb files:
151+
python .jenkins/post_process_notebooks.py
152+
143153
# Step 6: Copy generated HTML files and static files to S3
144154
7z a manager.7z docs
145155
awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z

.jenkins/custom_pandoc_filter.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock
2+
import markdown
3+
import html
4+
5+
def to_markdown(item, skip_octicon=False):
    """Render one pandoc AST node as an HTML/Markdown text fragment.

    Handles strings, whitespace, links, inline code, and code blocks.
    Any other node type renders as an empty string.

    Args:
        item: A pandoc JSON AST node: a dict with a ``'t'`` type tag and,
            for most node types, a ``'c'`` payload.
        skip_octicon: Not consulted by this function; it is only forwarded
            when recursing into link text. Kept so existing callers that
            pass it keep working.

    Returns:
        The rendered fragment as a ``str``.
    """
    kind = item['t']
    if kind == 'Str':
        return item['c']
    if kind in ('Space', 'SoftBreak'):
        # A SoftBreak is a wrapped source line inside a paragraph. Rendering
        # it as '' would glue the words on either side together.
        return ' '
    if kind == 'LineBreak':
        return '<br>'
    if kind == 'Link':
        link_text = ''.join(to_markdown(i, skip_octicon) for i in item['c'][1])
        # The URL lands inside a double-quoted HTML attribute, so escape
        # quotes and ampersands to keep the generated tag well-formed.
        href = html.escape(item['c'][2][0], quote=True)
        return f'<a href="{href}">{link_text}</a>'
    if kind == 'Code':
        # Need to remove octicons as they don't render in .ipynb
        if any(value == 'octicon' for key, value in item['c'][0][2]):
            return ''
        # Escape the code and wrap it in <code> tags
        return f'<code>{html.escape(item["c"][1])}</code>'
    if kind == 'CodeBlock':
        # Escape the code block and wrap it in <pre><code> tags
        return f'<pre><code>{html.escape(item["c"][1])}</code></pre>'
    return ''
27+
28+
29+
def process_admonitions(key, value, format, meta):
    """Pandoc filter action for admonitions and embedded-video HTML blocks.

    * ``Div`` nodes with class ``note``, ``tip`` or ``warning`` are replaced
      with four raw HTML blocks: a colored label bar, an opening body
      ``<div>``, the rendered body, and the closing ``</div>``.
    * Raw HTML blocks that contain ``iframe`` are replaced with a Python
      code block tagged ``python`` / ``jupyter-code-cell`` that displays the
      same HTML through ``IPython.display``.

    Args:
        key: Pandoc node type tag.
        value: Pandoc node payload.
        format: Target output format passed in by pandocfilters (unused).
        meta: Document metadata passed in by pandocfilters (unused).

    Returns:
        A list of replacement blocks, a single replacement block, or
        ``None`` to leave the node unchanged.
    """
    if key == 'Div':
        [[ident, classes, keyvals], contents] = value
        if 'note' in classes:
            color = '#54c7ec'
            label = 'NOTE:'
        elif 'tip' in classes:
            color = '#6bcebb'
            label = 'TIP:'
        elif 'warning' in classes:
            color = '#e94f3b'
            label = 'WARNING:'
        else:
            return None

        # Only paragraphs and code blocks contribute to the body; inside a
        # paragraph only strings, spaces, links, inline code and line breaks
        # are rendered. Fragments are concatenated without separators.
        fragments = []
        for block in contents:
            block_type = block.get('t')
            if block_type == 'Para':
                for item in block['c']:
                    item_type = item['t']
                    if item_type in ('SoftBreak', 'LineBreak'):
                        # A wrapped source line must still separate words.
                        fragments.append(' ')
                    elif item_type in ('Str', 'Space', 'Link', 'Code'):
                        fragments.append(to_markdown(item))
            elif block_type == 'CodeBlock':
                fragments.append(to_markdown(block))

        html_content = markdown.markdown(''.join(fragments))

        header_html = (
            f'<div style="background-color: {color}; color: #fff; font-weight: 700; padding-left: 10px; padding-top: 5px; padding-bottom: 5px"><strong>{label}</strong></div>'
        )
        body_open_html = '<div style="background-color: #f3f4f7; padding-left: 10px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px">'
        return [
            {'t': 'RawBlock', 'c': ['html', header_html]},
            {'t': 'RawBlock', 'c': ['html', body_open_html]},
            {'t': 'RawBlock', 'c': ['html', html_content]},
            {'t': 'RawBlock', 'c': ['html', '</div>']},
        ]
    elif key == 'RawBlock':
        # This is needed for the cells that have embedded video.
        # We add a special tag to those: ``` {.python .jupyter-code-cell}
        # The post-processing script then finds those and generates separate
        # code cells that can load video.
        [raw_format, content] = value
        if raw_format == 'html' and 'iframe' in content:
            # Python source that displays the original HTML. The leading
            # newline and exact line layout are part of the emitted text.
            python_code = (
                '\n'
                'from IPython.display import display, HTML\n'
                'html_code = """\n'
                f'{content}\n'
                '"""\n'
                'display(HTML(html_code))\n'
            )

            return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]}
    return None
83+
84+
85+
def process_images(key, value, format, meta):
    """Pandoc filter action that makes local image paths absolute.

    Prefixes relative and root-relative image sources with
    ``https://pytorch.org/tutorials/`` so that they load correctly in the
    notebook. Sources that are already absolute (``http://``, ``https://``,
    protocol-relative ``//`` or ``data:`` URIs) are left untouched.

    Args:
        key: Pandoc node type tag.
        value: Pandoc node payload; for ``Image`` this is
            ``[[ident, classes, keyvals], caption, [src, title]]``.
        format: Target output format passed in by pandocfilters (unused).
        meta: Document metadata passed in by pandocfilters (unused).

    Returns:
        A replacement ``Image`` node, or ``None`` for any other node type.
    """
    if key != 'Image':
        return None
    [ident, classes, keyvals], caption, [src, title] = value

    # Match real URL prefixes rather than any string starting with "http",
    # so a relative file such as "http_diagram.png" is still rewritten.
    absolute_prefixes = ('http://', 'https://', '//', 'data:')
    if not src.lower().startswith(absolute_prefixes):
        # Drop leading "../" and "./" segments.
        while src.startswith(('../', './')):
            src = src[3:] if src.startswith('../') else src[2:]
        # Strip leading slashes (e.g. "/_static/...") so the joined URL
        # never contains a double slash after "tutorials".
        src = 'https://pytorch.org/tutorials/' + src.lstrip('/')

    return {'t': 'Image', 'c': [[ident, classes, keyvals], caption, [src, title]]}
99+
100+
101+
def process_grids(key, value, format, meta):
    """Pandoc filter action that renders a ``grid`` Div as two HTML columns.

    Each child Div carrying the ``grid-item-card`` class is rendered
    (paragraphs as ``<h2>``, bullet lists as ``<ul>``) and appended to the
    left and right floating columns alternately, starting with the left.
    Only meant for the two-card layout used in the tutorial template.

    Returns:
        A raw HTML block holding both columns, or ``None`` when the node is
        not a Div with the ``grid`` class.
    """
    if key != 'Div':
        return None
    [[ident, classes, keyvals], contents] = value
    if 'grid' not in classes:
        return None

    columns = [
        '<div style="width: 45%; float: left; padding: 20px;">',
        '<div style="width: 45%; float: right; padding: 20px;">',
    ]
    target = 0
    for block in contents:
        is_card = block.get('t') == 'Div' and 'grid-item-card' in block['c'][0][1]
        if not is_card:
            continue

        pieces = []
        for element in block['c'][1]:
            element_type = element['t']
            if element_type == 'Para':
                heading = ''.join(to_markdown(inline) for inline in element['c'])
                pieces.append('<h2>' + heading + '</h2>')
            elif element_type == 'BulletList':
                pieces.append('<ul>')
                for entry in element['c']:
                    # Only the first block of each list entry is rendered.
                    entry_text = ''.join(to_markdown(inline) for inline in entry[0]['c'])
                    pieces.append('<li>' + entry_text + '</li>')
                pieces.append('</ul>')

        columns[target] += ''.join(pieces)
        target = (target + 1) % 2

    closed_columns = ''.join(column + '</div>' for column in columns)
    return {'t': 'RawBlock', 'c': ['html', closed_columns]}
125+
126+
def is_code_block(item):
    """Return True if *item* is an inline ``Code`` node whose text contains 'octicon'."""
    if item['t'] != 'Code':
        return False
    code_text = item['c'][1]
    return 'octicon' in code_text
128+
129+
130+
def process_all(key, value, format, meta):
    """Run the filter actions in order and return the first non-None result.

    The actions are tried as admonitions, then images, then grids. ``None``
    is returned when no action produces a replacement for the node.
    """
    handlers = (process_admonitions, process_images, process_grids)
    for handler in handlers:
        replacement = handler(key, value, format, meta)
        if replacement is not None:
            return replacement
    return None


if __name__ == "__main__":
    toJSONFilter(process_all)

.jenkins/post_process_notebooks.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import nbformat as nbf
2+
import os
3+
import re
4+
5+
"""
6+
This post-processing script needs to run after the .ipynb files are
7+
generated. The script removes extraneous ```{=html} syntax from the
8+
admonitions and splits the cells that have video iframe into a
9+
separate code cell that can be run to load the video directly
10+
in the notebook. This script is included in build.sh.
11+
"""
12+
13+
14+
# Pattern to search ``` {.python .jupyter-code-cell} fenced blocks.
# Group 1: text before the fence; group 2: the Python source that displays
# the video; group 3: text after the closing fence. Literal dots and braces
# are escaped so only the exact marker matches.
pattern = re.compile(
    r'(.*?)``` \{\.python \.jupyter-code-cell\}\n\n'
    r'(from IPython\.display import display, HTML\n'
    r'html_code = """\n.*?\n"""\n'
    r'display\(HTML\(html_code\)\))\n'
    r'```(.*)',
    re.DOTALL,
)
16+
17+
18+
def _split_video_cell(cell, index):
    """Split one markdown cell around every embedded video code block.

    Returns the list of cells that replaces ``cell``: cleaned markdown
    cells interleaved with new code cells, in document order.
    """
    result = []
    current = cell
    while True:
        match = pattern.search(current.source)
        if match is None:
            # Remove ```{=html} from the markdown
            current.source = remove_html_tag(current.source)
            result.append(current)
            return result

        print(f'Match found in cell {index}: {match.group(0)[:100]}...')
        # The text before the video block stays in the current cell. Clean
        # it as well, otherwise its ```{=html} fences are left behind.
        current.source = remove_html_tag(match.group(1))
        result.append(current)

        # Add a comment to run the cell to display the video
        code_block = "# Run this cell to load the video\n" + match.group(2)
        new_code_cell = nbf.v4.new_code_cell(source=code_block)
        result.append(new_code_cell)
        print(f'New code cell created with source: {new_code_cell.source}')

        # If there is content after the code block, continue with it in a
        # new markdown cell; it may contain further video blocks.
        after_html_block = match.group(3)
        if len(after_html_block.strip()) == 0:
            return result
        current = nbf.v4.new_markdown_cell(source=after_html_block)


def process_video_cell(notebook_path):
    """
    Find markdown cells that contain a
    "``` {.python .jupyter-code-cell}" code block and slice that block
    into a separate code cell (instead of markdown), which allows the
    video to load in the notebook. The content before and after the
    block is kept in markdown cells. Every markdown cell also has the
    extraneous ```{=html} syntax removed. The notebook file is
    rewritten in place.
    """
    print(f'Processing file: {notebook_path}')
    notebook = nbf.read(notebook_path, as_version=4)

    # Build a fresh list instead of inserting into the list being iterated.
    processed_cells = []
    for i, cell in enumerate(notebook.cells):
        if cell.cell_type == 'markdown':
            processed_cells.extend(_split_video_cell(cell, i))
        else:
            processed_cells.append(cell)
    notebook.cells = processed_cells

    nbf.write(notebook, notebook_path)
63+
64+
65+
def remove_html_tag(content):
    """
    Strip the ```{=html} fences that pandoc wraps around raw HTML blocks,
    since they prevent the HTML from rendering correctly in the notebook.
    The substitutions are applied one after another in the order listed.
    """
    substitutions = (
        (r'```{=html}\n<div', '<div'),
        (r'">\n```', '">'),
        (r'<\/div>\n```', '</div>\n'),
        (r'```{=html}\n</div>\n```', '</div>\n'),
        (r'```{=html}', ''),
        (r'</p>\n```', '</p>'),
    )
    for regex, replacement in substitutions:
        content = re.sub(regex, replacement, content)
    return content
78+
79+
80+
def walk_dir(downloads_dir):
    """
    Recursively visit ``downloads_dir`` and run ``process_video_cell``
    on every file whose name ends with ``.ipynb``.
    """
    for current_dir, _subdirs, filenames in os.walk(downloads_dir):
        notebook_names = [name for name in filenames if name.endswith('.ipynb')]
        for name in notebook_names:
            process_video_cell(os.path.join(current_dir, name))
89+
90+
91+
def main():
    """Post-process every notebook under ``./docs/_downloads``."""
    walk_dir('./docs/_downloads')


if __name__ == "__main__":
    main()

conf.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@
4242
import distutils.file_util
4343
import re
4444
from get_sphinx_filenames import SPHINX_SHOULD_RUN
45-
45+
import pandocfilters
46+
import pypandoc
4647
import plotly.io as pio
4748
pio.renderers.default = 'sphinx_gallery'
4849

@@ -74,7 +75,8 @@
7475
'sphinx.ext.intersphinx',
7576
'sphinx_copybutton',
7677
'sphinx_gallery.gen_gallery',
77-
'sphinx_design'
78+
'sphinx_design',
79+
'nbsphinx'
7880
]
7981

8082
intersphinx_mapping = {
@@ -107,7 +109,10 @@ def reset_seeds(gallery_conf, fname):
107109
"# https://pytorch.org/tutorials/beginner/colab\n"
108110
"%matplotlib inline"),
109111
'reset_modules': (reset_seeds),
110-
'ignore_pattern': r'_torch_export_nightly_tutorial.py'
112+
'ignore_pattern': r'_torch_export_nightly_tutorial.py',
113+
'pypandoc': {'extra_args': ['--mathjax', '--toc'],
114+
'filters': ['.jenkins/custom_pandoc_filter.py'],
115+
},
111116
}
112117

113118
if os.getenv('GALLERY_PATTERN'):

requirements.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
sphinx==5.0.0
55
sphinx-gallery==0.11.1
66
sphinx_design
7+
nbsphinx
78
docutils==0.16
89
sphinx-copybutton
10+
pypandoc==1.12
11+
pandocfilters
12+
markdown
913
tqdm==4.66.1
1014
numpy==1.24.4
1115
matplotlib
@@ -28,7 +32,7 @@ torchx
2832
torchrl==0.3.0
2933
tensordict==0.3.0
3034
ax-platform
31-
nbformat>=4.2.0
35+
nbformat>=5.9.2
3236
datasets
3337
transformers
3438
torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable

0 commit comments

Comments
 (0)