From 242f4207d084dff40af106119587d9241c0de16a Mon Sep 17 00:00:00 2001
From: Ankita De
Date: Sat, 24 Sep 2022 10:47:53 -0700
Subject: [PATCH 01/23] [WIP] Add torchmultimodal tutorial for flava finetuning

---
 Makefile                                     |   4 +
 beginner_source/flava_finetuning_tutorial.py | 173 +++++++++++++++++++
 index.rst                                    |  17 ++
 requirements.txt                             |   3 +
 4 files changed, 197 insertions(+)
 create mode 100644 beginner_source/flava_finetuning_tutorial.py

diff --git a/Makefile b/Makefile
index 8c21384967c..a01ea69bb50 100644
--- a/Makefile
+++ b/Makefile
@@ -102,6 +102,10 @@ download:
 	wget -nv -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR)
 	cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth
 
+	# Download vocab for beginner_source/flava_finetuning_tutorial.py
+	wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR)
+	tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
+
 docs:
 	make download
 
diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
new file mode 100644
index 00000000000..0a067043e6d
--- /dev/null
+++ b/beginner_source/flava_finetuning_tutorial.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+"""
+TorchMultimodal Tutorial : Finetuning FLAVA
+=======================
+"""
+
+######################################################################
+# Multimodal AI has recently become very popular owing to its ubiquitous
+# nature, from use cases like image captioning and visual search to more
+# recent applications like image generation from text. **TorchMultimodal
+# is a library powered by PyTorch consisting of building blocks and end-to-end
+# examples, aiming to enable and accelerate research in
+# multimodality**.
+#
+# In this tutorial, we will demonstrate how to use a **pretrained SoTA
+# model called** `FLAVA `__ **from
+# TorchMultimodal library to finetune on a multimodal task i.e. visual
+# question answering** (VQA).
+#
+
+
+######################################################################
+# Installations
+#
+# We will use TextVQA dataset from HuggingFace for this
+# tutorial. So we install datasets in addition to TorchMultimodal
+#
+
+# TODO: replace with install from pip when binary is ready
+!git clone https://github.com/facebookresearch/multimodal.git
+!pip install -r multimodal/requirements.txt
+import os
+import sys
+sys.path.append(os.path.join(os.getcwd(),"multimodal"))
+sys.path.append(os.getcwd())
+!pip install datasets
+!pip install transformers
+
+
+######################################################################
+# For this tutorial, we treat VQA as a classification task. So we need to
+# download the vocab file with answer classes and create the answer to
+# label mapping.
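+#
+# As a sketch of what we are about to build (assuming one answer per line in
+# the vocab file), the mapping is a plain ``{answer_string: class_index}``
+# dictionary; an equivalent one-liner would be:
+#
+# .. code-block:: python
+#
+#    answer_to_idx = {ans.strip("\n"): i for i, ans in enumerate(vocab)}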
+#
+# We also load the `textvqa
+# dataset `__ from HuggingFace
+#
+
+!wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz
+!tar xf vocab.tar.gz
+
+
+with open("vocabs/answers_textvqa_more_than_1.txt") as f:
+    vocab = f.readlines()
+
+answer_to_idx = {}
+for idx, entry in enumerate(vocab):
+    answer_to_idx[entry.strip("\n")] = idx
+
+
+######################################################################
+# We see there are 3997 answer classes including a class representing
+# unknown answers
+#
+
+print(len(vocab))
+print(vocab[:5])
+
+from datasets import load_dataset
+dataset = load_dataset("textvqa")
+
+from IPython.display import display, Image
+idx = 5
+print("Question: ", dataset["train"][idx]["question"])
+print("Answers: " ,dataset["train"][idx]["answers"])
+display(dataset["train"][idx]["image"].resize((500,500)))
+
+
+######################################################################
+# Next we write the transform function to convert the image and text into
+# Tensors consumable by our model - For images, we use the transforms from
+# torchvision to convert to Tensor and resize to uniform sizes - For text,
+# we tokenize (and pad) them using the BertTokenizer from HuggingFace -
+# For answers (i.e. labels), we take the most frequently occurring answer
+# as the label to train with
+#
+
+import torch
+from torchvision import transforms
+from collections import defaultdict
+from transformers import BertTokenizer
+from functools import partial
+
+def transform(tokenizer, input):
+    batch = {}
+    image_transform = transforms.Compose([transforms.ToTensor(), transforms.Resize([224,224])])
+    image = image_transform(input["image"][0].convert("RGB"))
+    batch["image"] = [image]
+
+    tokenized=tokenizer(input["question"],return_tensors='pt',padding="max_length",max_length=512)
+    batch.update(tokenized)
+
+
+    ans_to_count = defaultdict(int)
+    for ans in input["answers"][0]:
+        ans_to_count[ans] += 1
+    max_value = max(ans_to_count, key=ans_to_count.get)
+    ans_idx = answer_to_idx.get(max_value,0)
+    batch["answers"] = torch.as_tensor([ans_idx])
+
+    return batch
+
+tokenizer=BertTokenizer.from_pretrained("bert-base-uncased",padding="max_length",max_length=512)
+transform=partial(transform,tokenizer)
+dataset.set_transform(transform)
+
+
+######################################################################
+# Finally, we import the flava_model_for_classification from
+# torchmultimodal. It loads the pretrained flava checkpoint by default and
+# includes a classification head.
+#
+# The model forward function passes the image through the visual encoder
+# and the question through the text encoder. The image and question
+# embeddings are then passed through the multimodal encoder. The final
+# embedding corresponding to the CLS token is passed through an MLP head
+# which finally gives the probability distribution over each possible
+# answer.
+#
+
+from torchmultimodal.models.flava.model import flava_model_for_classification
+model = flava_model_for_classification(num_classes=len(vocab))
+
+
+######################################################################
+# We put together the dataset and model in a toy training loop to
+# demonstrate how to train the model for 3 iterations.
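+#
+# As a quick sanity check first (a sketch added here; it assumes the
+# classification output also exposes per-class ``logits`` alongside the
+# ``loss`` used below), you can run a single forward pass on one batch:
+#
+# .. code-block:: python
+#
+#    from torch.utils.data import DataLoader
+#
+#    batch = next(iter(DataLoader(dataset["train"], batch_size=2)))
+#    out = model(text=batch["input_ids"], image=batch["image"], labels=batch["answers"], required_embedding="mm")
+#    print(out.loss)          # scalar loss for the batch
+#    print(out.logits.shape)  # (batch_size, num_classes), i.e. (2, 3997)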
+#
+
+from torch import nn
+BATCH_SIZE = 2
+MAX_STEPS = 3
+from torch.utils.data import DataLoader
+
+train_dataloader = DataLoader(dataset["train"], batch_size= BATCH_SIZE)
+optimizer = torch.optim.AdamW(model.parameters())
+
+
+epochs = 1
+for _ in range(epochs):
+    for idx, batch in enumerate(train_dataloader):
+        optimizer.zero_grad()
+        out = model(text = batch["input_ids"], image = batch["image"], labels = batch["answers"], required_embedding="mm")
+        loss = out.loss
+        loss.backward()
+        optimizer.step()
+        print(f"Loss at step {idx} = {loss}")
+        if idx >= MAX_STEPS-1:
+            break
+
+
+######################################################################
+# Conclusion
+#
+# This tutorial introduced the basics around how to finetune on a
+# multimodal task using FLAVA from TorchMultimodal. Please also check out
+# other examples from the library like
+# `MDETR `__
+# which is a multimodal model for object detection and
+# `Omnivore `__
+# which is a multitask model spanning image, video and 3d classification.
+#
+

diff --git a/index.rst b/index.rst
index 89f04219d87..5bb20a34825 100644
--- a/index.rst
+++ b/index.rst
@@ -646,6 +646,15 @@ What's new in PyTorch tutorials?
    :link: advanced/sharding.html
    :tags: TorchRec,Recommender
 
+.. Multimodality
+
+.. customcarditem::
+   :header: Introduction to TorchMultimodal
+   :card_description: TorchMultimodal is a library that provides models, primitives and examples for training multimodal tasks
+   :image: _static/img/thumbnails/torchrec.png
+   :link: beginner/flava_finetuning_tutorial.html
+   :tags: TorchMultimodal
+
 .. End of tutorial card section
 
@@ -919,3 +928,11 @@ Additional Resources
 
    intermediate/torchrec_tutorial
    advanced/sharding
+
+.. toctree::
+   :maxdepth: 2
+   :includehidden:
+   :hidden:
+   :caption: Multimodality
+
+   beginner/flava_finetuning_tutorial

diff --git a/requirements.txt b/requirements.txt
index dd632f368a4..cd621afed32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,6 +27,9 @@ pytorch-lightning
 torchx
 ax-platform
 nbformat>=4.2.0
+datasets
+transformers
+torchmultimodal-nightly
 
 # PyTorch Theme
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme

From d991b9b1a3f003a40ad855ff40f43d8c0010ca7f Mon Sep 17 00:00:00 2001
From: Ankita De
Date: Mon, 26 Sep 2022 11:01:06 -0700
Subject: [PATCH 02/23] [WIP] Add torchmultimodal tutorial for flava finetuning

ghstack-source-id: e04328489b09a4f53d731b7aebcb424881567531
Pull Request resolved: https://github.com/pytorch/tutorials/pull/2055
---
 Makefile                                     |   4 +
 beginner_source/flava_finetuning_tutorial.py | 172 +++++++++++++++++++
 index.rst                                    |  17 ++
 requirements.txt                             |   3 +
 4 files changed, 196 insertions(+)
 create mode 100644 beginner_source/flava_finetuning_tutorial.py

diff --git a/Makefile b/Makefile
index 8c21384967c..a01ea69bb50 100644
--- a/Makefile
+++ b/Makefile
@@ -102,6 +102,10 @@ download:
 	wget -nv -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR)
 	cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth
 
+	# Download vocab for beginner_source/flava_finetuning_tutorial.py
+	wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR)
+	tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
+
 docs:
 	make download
 
diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
new file mode 100644
index 00000000000..a4f5d030a54
--- /dev/null
+++ b/beginner_source/flava_finetuning_tutorial.py
@@ -0,0 +1,172 @@
+######################################################################
+# TorchMultimodal Tutorial: FLAVA finetuning
+# --------------------------------------------
+#
+
+######################################################################
+# Multimodal AI has recently become very popular owing to its ubiquitous
+# nature, from use cases like image captioning and visual search to more
+# recent applications like image generation from text. **TorchMultimodal
+# is a library powered by PyTorch consisting of building blocks and end-to-end
+# examples, aiming to enable and accelerate research in
+# multimodality**.
+#
+# In this tutorial, we will demonstrate how to use a **pretrained SoTA
+# model called** `FLAVA `__ **from
+# TorchMultimodal library to finetune on a multimodal task i.e. visual
+# question answering** (VQA).
+#
+
+
+######################################################################
+# Installations
+#
+# We will use TextVQA dataset from HuggingFace for this
+# tutorial. So we install datasets in addition to TorchMultimodal
+#
+
+# TODO: replace with install from pip when binary is ready
+!git clone https://github.com/facebookresearch/multimodal.git
+!pip install -r multimodal/requirements.txt
+import os
+import sys
+sys.path.append(os.path.join(os.getcwd(),"multimodal"))
+sys.path.append(os.getcwd())
+!pip install datasets
+!pip install transformers
+
+
+######################################################################
+# For this tutorial, we treat VQA as a classification task. So we need to
+# download the vocab file with answer classes and create the answer to
+# label mapping.
+#
+# We also load the `textvqa
+# dataset `__ from HuggingFace
+#
+
+!wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz
+!tar xf vocab.tar.gz
+
+
+with open("vocabs/answers_textvqa_more_than_1.txt") as f:
+    vocab = f.readlines()
+
+answer_to_idx = {}
+for idx, entry in enumerate(vocab):
+    answer_to_idx[entry.strip("\n")] = idx
+
+
+######################################################################
+# We see there are 3997 answer classes including a class representing
+# unknown answers
+#
+
+print(len(vocab))
+print(vocab[:5])
+
+from datasets import load_dataset
+dataset = load_dataset("textvqa")
+
+from IPython.display import display, Image
+idx = 5
+print("Question: ", dataset["train"][idx]["question"])
+print("Answers: " ,dataset["train"][idx]["answers"])
+display(dataset["train"][idx]["image"].resize((500,500)))
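+
+######################################################################
+# Each entry is a dictionary. As a quick check (a sketch added here), we can
+# confirm the fields the rest of the tutorial relies on are present:
+
+print(dataset["train"][idx].keys())  # expect at least 'image', 'question' and 'answers'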
+
+
+######################################################################
+# Next we write the transform function to convert the image and text into
+# Tensors consumable by our model - For images, we use the transforms from
+# torchvision to convert to Tensor and resize to uniform sizes - For text,
+# we tokenize (and pad) them using the BertTokenizer from HuggingFace -
+# For answers (i.e. labels), we take the most frequently occurring answer
+# as the label to train with
+#
+
+import torch
+from torchvision import transforms
+from collections import defaultdict
+from transformers import BertTokenizer
+from functools import partial
+
+def transform(tokenizer, input):
+    batch = {}
+    image_transform = transforms.Compose([transforms.ToTensor(), transforms.Resize([224,224])])
+    image = image_transform(input["image"][0].convert("RGB"))
+    batch["image"] = [image]
+
+    tokenized=tokenizer(input["question"],return_tensors='pt',padding="max_length",max_length=512)
+    batch.update(tokenized)
+
+
+    ans_to_count = defaultdict(int)
+    for ans in input["answers"][0]:
+        ans_to_count[ans] += 1
+    max_value = max(ans_to_count, key=ans_to_count.get)
+    ans_idx = answer_to_idx.get(max_value,0)
+    batch["answers"] = torch.as_tensor([ans_idx])
+
+    return batch
+
+tokenizer=BertTokenizer.from_pretrained("bert-base-uncased",padding="max_length",max_length=512)
+transform=partial(transform,tokenizer)
+dataset.set_transform(transform)
+
+
+######################################################################
+# Finally, we import the flava_model_for_classification from
+# torchmultimodal. It loads the pretrained flava checkpoint by default and
+# includes a classification head.
+#
+# The model forward function passes the image through the visual encoder
+# and the question through the text encoder. The image and question
+# embeddings are then passed through the multimodal encoder. The final
+# embedding corresponding to the CLS token is passed through an MLP head
+# which finally gives the probability distribution over each possible
+# answer.
+#
+
+from torchmultimodal.models.flava.model import flava_model_for_classification
+model = flava_model_for_classification(num_classes=len(vocab))
+
+
+######################################################################
+# We put together the dataset and model in a toy training loop to
+# demonstrate how to train the model for 3 iterations.
+#
+
+from torch import nn
+BATCH_SIZE = 2
+MAX_STEPS = 3
+from torch.utils.data import DataLoader
+
+train_dataloader = DataLoader(dataset["train"], batch_size= BATCH_SIZE)
+optimizer = torch.optim.AdamW(model.parameters())
+
+
+epochs = 1
+for _ in range(epochs):
+    for idx, batch in enumerate(train_dataloader):
+        optimizer.zero_grad()
+        out = model(text = batch["input_ids"], image = batch["image"], labels = batch["answers"], required_embedding="mm")
+        loss = out.loss
+        loss.backward()
+        optimizer.step()
+        print(f"Loss at step {idx} = {loss}")
+        if idx >= MAX_STEPS-1:
+            break
+
+
+######################################################################
+# Conclusion
+#
+# This tutorial introduced the basics around how to finetune on a
+# multimodal task using FLAVA from TorchMultimodal. Please also check out
+# other examples from the library like
+# `MDETR `__
+# which is a multimodal model for object detection and
+# `Omnivore `__
+# which is a multitask model spanning image, video and 3d classification.
+#

diff --git a/index.rst b/index.rst
index 89f04219d87..5bb20a34825 100644
--- a/index.rst
+++ b/index.rst
@@ -646,6 +646,15 @@ What's new in PyTorch tutorials?
    :link: advanced/sharding.html
    :tags: TorchRec,Recommender
 
+.. Multimodality
+
+.. 
customcarditem:: + :header: Introduction to TorchMultimodal + :card_description: TorchMultimodal is a library that provides models, primitives and examples for training multimodal tasks + :image: _static/img/thumbnails/torchrec.png + :link: beginner/flava_finetuning_tutorial.html + :tags: TorchMultimodal + .. End of tutorial card section @@ -919,3 +928,11 @@ Additional Resources intermediate/torchrec_tutorial advanced/sharding + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Multimodality + + beginner/flava_finetuning_tutorial diff --git a/requirements.txt b/requirements.txt index dd632f368a4..cd621afed32 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,9 @@ pytorch-lightning torchx ax-platform nbformat>=4.2.0 +datasets +transformers +torchmultimodal-nightly # PyTorch Theme -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme From 3188b6644af25dbd65c646741c1db29faa0eb1dd Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 26 Sep 2022 12:48:32 -0700 Subject: [PATCH 03/23] Update --- beginner_source/flava_finetuning_tutorial.py | 57 ++++++++------------ requirements.txt | 4 ++ 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index 0a067043e6d..ec87ba86f7d 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -TorchMultimodal Tutorial : Finetuning FLAVA -======================= +TorchMultimodal Tutorial: Finetuning FLAVA +============================================ """ ###################################################################### @@ -14,43 +14,33 @@ # # In this tutorial, we will demonstrate how to use a **pretrained SoTA # model called** `FLAVA `__ **from -# TorchMultimodal library to finetune on a multimodal task i.e. visual +# TorchMultimodal library to finetune on a multimodal task i.e. visual # question answering** (VQA). # ###################################################################### # Installations -# +# +# # We will use TextVQA dataset from HuggingFace for this -# tutorial. So we install datasets in addition to TorchMultimodal -# +# tutorial. So we install datasets in addition to TorchMultimodal. -# TODO: replace with install from pip when binary is ready -!git clone https://github.com/facebookresearch/multimodal.git -!pip install -r multimodal/requirements.txt import os -import sys +import sys sys.path.append(os.path.join(os.getcwd(),"multimodal")) sys.path.append(os.getcwd()) -!pip install datasets -!pip install transformers - ###################################################################### # For this tutorial, we treat VQA as a classification task. So we need to # download the vocab file with answer classes and create the answer to # label mapping. 
-# +# # We also load the `textvqa # dataset `__ from HuggingFace -# +# -!wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -!tar xf vocab.tar.gz - - -with open("vocabs/answers_textvqa_more_than_1.txt") as f: +with open("data/vocabs/answers_textvqa_more_than_1.txt") as f: vocab = f.readlines() answer_to_idx = {} @@ -61,7 +51,7 @@ ###################################################################### # We see there are 3997 answer classes including a class representing # unknown answers -# +# print(len(vocab)) print(vocab[:5]) @@ -81,9 +71,9 @@ # Tensors consumable by our model - For images, we use the transforms from # torchvision to convert to Tensor and resize to uniform sizes - For text, # we tokenize (and pad) them using the BertTokenizer from HuggingFace - -# For answers (i.e. labels), we take the most frequently occuring answer +# For answers (i.e. labels), we take the most frequently occuring answer # as the label to train with -# +# import torch from torchvision import transforms @@ -99,15 +89,15 @@ def transform(tokenizer, input): tokenized=tokenizer(input["question"],return_tensors='pt',padding="max_length",max_length=512) batch.update(tokenized) - - + + ans_to_count = defaultdict(int) for ans in input["answers"][0]: ans_to_count[ans] += 1 max_value = max(ans_to_count, key=ans_to_count.get) ans_idx = answer_to_idx.get(max_value,0) batch["answers"] = torch.as_tensor([ans_idx]) - + return batch tokenizer=BertTokenizer.from_pretrained("bert-base-uncased",padding="max_length",max_length=512) @@ -119,14 +109,14 @@ def transform(tokenizer, input): # Finally, we import the flava_model_for_classification from # torchmultimodal. It loads the pretrained flava checkpoint by default and # includes a classification head. -# +# # The model forward function passes the image through the visual encoder # and the question through the text encoder. The image and question # embeddings are then passed through the multimodal encoder. The final # embedding corresponding to the CLS token is passed through a MLP head # which finally gives the probability distribution over each possible # answers. -# +# from torchmultimodal.models.flava.model import flava_model_for_classification model = flava_model_for_classification(num_classes=len(vocab)) @@ -135,7 +125,7 @@ def transform(tokenizer, input): ###################################################################### # We put together the dataset and model in a toy training loop to # demonstrate how to train the model for 3 iterations. -# +# from torch import nn BATCH_SIZE = 2 @@ -144,8 +134,8 @@ def transform(tokenizer, input): train_dataloader = DataLoader(dataset["train"], batch_size= BATCH_SIZE) optimizer = torch.optim.AdamW(model.parameters()) - - + + epochs = 1 for _ in range(epochs): for idx, batch in enumerate(train_dataloader): @@ -161,7 +151,7 @@ def transform(tokenizer, input): ###################################################################### # Conclusion -# +# # This tutorial introduced the basics around how to finetune on a # multimodal task using FLAVA from TorchMultimodal. Please also check out # other examples from the library like @@ -169,5 +159,4 @@ def transform(tokenizer, input): # which is a multimodal model for object detection and # `Omnivore `__ # which is multitask model spanning image, video and 3d classification. 
-# - +# diff --git a/requirements.txt b/requirements.txt index 95a778aa2c0..cdbf101b477 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,3 +48,7 @@ wget gym==0.24.0 gym-super-mario-bros==7.3.0 timm + +# flava tutorial - multimodal +packaging +iopath From 7002ed559f90f352bf3c0cd67afef2b09ba4cdf9 Mon Sep 17 00:00:00 2001 From: Ankita De Date: Mon, 26 Sep 2022 22:36:29 -0700 Subject: [PATCH 04/23] Fix imports --- beginner_source/flava_finetuning_tutorial.py | 16 +++++++++------- requirements.txt | 2 -- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index 8585184fc9a..b892bc0bfe9 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -22,14 +22,16 @@ ###################################################################### # Installations # +# We will use TextVQA dataset and bert tokenizer from HuggingFace for this +# tutorial. So you need to install datasets and transformers in addition to TorchMultimodal. +# When running this tutorial in Google Colab, install the required packages +# by uncommenting the following: # -# We will use TextVQA dataset from HuggingFace for this -# tutorial. So we install datasets in addition to TorchMultimodal. - -import os -import sys -sys.path.append(os.path.join(os.getcwd(),"multimodal")) -sys.path.append(os.getcwd()) +""" +!pip install torchmultimodal-nightly +!pip install datasets +!pip install transformers +""" ###################################################################### # For this tutorial, we treat VQA as a classification task. So we need to diff --git a/requirements.txt b/requirements.txt index cdbf101b477..7f972af3fa2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,6 +49,4 @@ gym==0.24.0 gym-super-mario-bros==7.3.0 timm -# flava tutorial - multimodal -packaging iopath From c33c3aa435575cba1b005b440cc466295f468dbe Mon Sep 17 00:00:00 2001 From: Ankita De Date: Mon, 3 Oct 2022 09:44:03 -0700 Subject: [PATCH 05/23] Address comments --- beginner_source/flava_finetuning_tutorial.py | 59 ++++++++++++-------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index b892bc0bfe9..d0af50a7031 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -15,13 +15,15 @@ # In this tutorial, we will demonstrate how to use a **pretrained SoTA # model called** `FLAVA `__ **from # TorchMultimodal library to finetune on a multimodal task i.e. visual -# question answering** (VQA). -# +# question answering** (VQA). The model consists of two unimodal transformer +# based encoders for text and image and a multimodal encoder to combine +# the two embeddings. It is pretrained using contrastive, image text matching and +# text, image and multimodal masking losses. ###################################################################### -# Installations -# +# Installation +# ----------------- # We will use TextVQA dataset and bert tokenizer from HuggingFace for this # tutorial. So you need to install datasets and transformers in addition to TorchMultimodal. # When running this tutorial in Google Colab, install the required packages @@ -34,13 +36,25 @@ """ ###################################################################### -# For this tutorial, we treat VQA as a classification task. 
So we need to -# download the vocab file with answer classes and create the answer to +# Steps +# ----- +# 1. Download the HuggingFace dataset to a directory on your computer by running the following command: +# wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz +# tar xf vocab.tar.gz +# If you are running this tutorial in Google Colab, run these commands +# in a new cell and prepend these commands with an exclamation mark (!) +# +# +# 2. For this tutorial, we treat VQA as a classification task where the inputs are images and question (text) and the output is an answer class. +# So we need to download the vocab file with answer classes and create the answer to # label mapping. # # We also load the `textvqa -# dataset `__ from HuggingFace +# dataset `__ containing 34602 training samples +# (images,questions and answers) from HuggingFace # +# We see there are 3997 answer classes including a class representing +# unknown answers. with open("data/vocabs/answers_textvqa_more_than_1.txt") as f: vocab = f.readlines() @@ -48,28 +62,27 @@ answer_to_idx = {} for idx, entry in enumerate(vocab): answer_to_idx[entry.strip("\n")] = idx - - -###################################################################### -# We see there are 3997 answer classes including a class representing -# unknown answers -# - print(len(vocab)) print(vocab[:5]) + from datasets import load_dataset dataset = load_dataset("textvqa") -from IPython.display import display, Image -idx = 5 -print("Question: ", dataset["train"][idx]["question"]) +###################################################################### +# Lets display a sample entry from the dataset + +import matplotlib.pyplot as plt +import numpy as np +idx = 5 +print("Question: ", dataset["train"][idx]["question"]) print("Answers: " ,dataset["train"][idx]["answers"]) -display(dataset["train"][idx]["image"].resize((500,500))) +im = np.asarray(dataset["train"][idx]["image"].resize((500,500))) +plt.imshow(im) plt.show() ###################################################################### -# Next we write the transform function to convert the image and text into +# 3. Next, we write the transform function to convert the image and text into # Tensors consumable by our model - For images, we use the transforms from # torchvision to convert to Tensor and resize to uniform sizes - For text, # we tokenize (and pad) them using the BertTokenizer from HuggingFace - @@ -107,7 +120,7 @@ def transform(tokenizer, input): ###################################################################### -# Finally, we import the flava_model_for_classification from +# 4. Finally, we import the flava_model_for_classification from # torchmultimodal. It loads the pretrained flava checkpoint by default and # includes a classification head. # @@ -124,7 +137,7 @@ def transform(tokenizer, input): ###################################################################### -# We put together the dataset and model in a toy training loop to +# 5. We put together the dataset and model in a toy training loop to # demonstrate how to train the model for 3 iterations. 
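+#
+# .. note::
+#
+#    The toy loop below runs on the CPU. As a sketch (an addition here,
+#    assuming a CUDA device is available), training can be moved to a GPU:
+#
+#    .. code-block:: python
+#
+#       device = "cuda" if torch.cuda.is_available() else "cpu"
+#       model.to(device)
+#       # and, inside the loop, move each tensor field of the batch:
+#       # batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}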
# @@ -141,7 +154,7 @@ def transform(tokenizer, input): for _ in range(epochs): for idx, batch in enumerate(train_dataloader): optimizer.zero_grad() - out = model(text = batch["input_ids"], image = batch["image"], labels = batch["answers"], required_embedding="mm") + out = model(text = batch["input_ids"], image = batch["image"], labels = batch["answers"]) loss = out.loss loss.backward() optimizer.step() @@ -152,7 +165,7 @@ def transform(tokenizer, input): ###################################################################### # Conclusion -# +# ------------------- # This tutorial introduced the basics around how to finetune on a # multimodal task using FLAVA from TorchMultimodal. Please also check out # other examples from the library like From 31d1ca4931eec6e7000894e5f97e201fbcd1106f Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 3 Oct 2022 12:40:54 -0700 Subject: [PATCH 06/23] Fix syntaxerror --- beginner_source/flava_finetuning_tutorial.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index d0af50a7031..34d7b533123 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -26,14 +26,17 @@ # ----------------- # We will use TextVQA dataset and bert tokenizer from HuggingFace for this # tutorial. So you need to install datasets and transformers in addition to TorchMultimodal. -# When running this tutorial in Google Colab, install the required packages -# by uncommenting the following: # -""" -!pip install torchmultimodal-nightly -!pip install datasets -!pip install transformers -""" +# .. note:: +# +# When running this tutorial in Google Colab, install the required packages by +# creating a new cell and running the following commands: +# +# .. 
code-block:: +# +# !pip install torchmultimodal-nightly +# !pip install datasets +# !pip install transformers ###################################################################### # Steps From 6b6563a0b9f9f71bfa1f60e996cadea15575d04d Mon Sep 17 00:00:00 2001 From: Ankita De Date: Mon, 3 Oct 2022 15:32:21 -0700 Subject: [PATCH 07/23] Fix syntax --- beginner_source/flava_finetuning_tutorial.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index 34d7b533123..5a42799ca3f 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -81,7 +81,8 @@ print("Question: ", dataset["train"][idx]["question"]) print("Answers: " ,dataset["train"][idx]["answers"]) im = np.asarray(dataset["train"][idx]["image"].resize((500,500))) -plt.imshow(im) plt.show() +plt.imshow(im) +plt.show() ###################################################################### From 0fe598ca4d3eeb877ac6710ab0b775fce56344e6 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 3 Oct 2022 16:17:28 -0700 Subject: [PATCH 08/23] Fix formatting --- beginner_source/flava_finetuning_tutorial.py | 38 ++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index 5a42799ca3f..92bc5031709 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -37,27 +37,36 @@ # !pip install torchmultimodal-nightly # !pip install datasets # !pip install transformers +# ###################################################################### # Steps # ----- +# # 1. Download the HuggingFace dataset to a directory on your computer by running the following command: -# wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -# tar xf vocab.tar.gz -# If you are running this tutorial in Google Colab, run these commands -# in a new cell and prepend these commands with an exclamation mark (!) +# +# .. code-block:: +# +# wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz +# tar xf vocab.tar.gz +# +# .. note:: +# If you are running this tutorial in Google Colab, run these commands +# in a new cell and prepend these commands with an exclamation mark (!) # # -# 2. For this tutorial, we treat VQA as a classification task where the inputs are images and question (text) and the output is an answer class. -# So we need to download the vocab file with answer classes and create the answer to -# label mapping. +# 2. For this tutorial, we treat VQA as a classification task where +# the inputs are images and question (text) and the output is an answer class. +# So we need to download the vocab file with answer classes and create the answer to +# label mapping. # -# We also load the `textvqa -# dataset `__ containing 34602 training samples -# (images,questions and answers) from HuggingFace +# We also load the `textvqa +# dataset `__ containing 34602 training samples +# (images,questions and answers) from HuggingFace # # We see there are 3997 answer classes including a class representing # unknown answers. 
+# with open("data/vocabs/answers_textvqa_more_than_1.txt") as f: vocab = f.readlines() @@ -68,12 +77,12 @@ print(len(vocab)) print(vocab[:5]) - from datasets import load_dataset dataset = load_dataset("textvqa") ###################################################################### -# Lets display a sample entry from the dataset +# Lets display a sample entry from the dataset: +# import matplotlib.pyplot as plt import numpy as np @@ -91,7 +100,7 @@ # torchvision to convert to Tensor and resize to uniform sizes - For text, # we tokenize (and pad) them using the BertTokenizer from HuggingFace - # For answers (i.e. labels), we take the most frequently occuring answer -# as the label to train with +# as the label to train with: # import torch @@ -142,7 +151,7 @@ def transform(tokenizer, input): ###################################################################### # 5. We put together the dataset and model in a toy training loop to -# demonstrate how to train the model for 3 iterations. +# demonstrate how to train the model for 3 iterations: # from torch import nn @@ -170,6 +179,7 @@ def transform(tokenizer, input): ###################################################################### # Conclusion # ------------------- +# # This tutorial introduced the basics around how to finetune on a # multimodal task using FLAVA from TorchMultimodal. Please also check out # other examples from the library like From 720d370f2348d05c3376ff77bd3cba2eb0428640 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 10 Oct 2022 10:02:30 -0700 Subject: [PATCH 09/23] [DO NOT MERGE] 1.13 RC Test --- .jenkins/build.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 3aca5ba2a01..ac7c333bcdc 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -26,8 +26,11 @@ pip install -r $DIR/../requirements.txt # RC Link # pip uninstall -y torch torchvision torchaudio torchtext # pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext -# pip uninstall -y torch torchvision torchaudio torchtext -# pip install -f https://download.pytorch.org/whl/test/cu111/torch_test.html torch torchvision torchaudio torchtext + +# Test enabled for PyTorch 1.13 RC Below +pip uninstall -y torch torchvision torchaudio torchtext +pip install --extra-index-url https://download.pytorch.org/whl/test/cu116 torch torchvision torchaudio torchtext +pip install --extra-index-url https://download.pytorch.org/whl/test torchdata # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From e67331d720d55444a09d97d22e74712a88f86a8b Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 11 Oct 2022 10:15:46 -0700 Subject: [PATCH 10/23] Update .jenkins/build.sh Co-authored-by: Nikita Shulga --- .jenkins/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index ac7c333bcdc..f314e2f9610 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -30,7 +30,6 @@ pip install -r $DIR/../requirements.txt # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext pip install --extra-index-url https://download.pytorch.org/whl/test/cu116 torch torchvision torchaudio torchtext -pip install --extra-index-url https://download.pytorch.org/whl/test torchdata # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From 
38939c46282ed9b04a910a19bbee1b6f167b7511 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 13 Oct 2022 11:57:54 -0700 Subject: [PATCH 11/23] Update build.sh --- .jenkins/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index f314e2f9610..264d4bf6e42 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -29,7 +29,7 @@ pip install -r $DIR/../requirements.txt # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext -pip install --extra-index-url https://download.pytorch.org/whl/test/cu116 torch torchvision torchaudio torchtext +pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/test/cu116/torch_test.html # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From c0d5fedbc47dc1f0ad708f8a8bf0569a4f76d040 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Fri, 14 Oct 2022 11:37:46 -0700 Subject: [PATCH 12/23] Update build.sh --- .jenkins/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 264d4bf6e42..15b0ff90b25 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -29,7 +29,7 @@ pip install -r $DIR/../requirements.txt # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext -pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/test/cu116/torch_test.html +pip3 install --pre torch torchvision torchaudio torchtext -f https://download.pytorch.org/whl/test/cu116/torch_test.html # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From f509d8e1a3bf9a6de6554e17d8cd2cf359c76d8d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Oct 2022 13:33:09 -0700 Subject: [PATCH 13/23] Update build.sh --- .jenkins/build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 15b0ff90b25..23cc9d4dbdc 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -29,7 +29,8 @@ pip install -r $DIR/../requirements.txt # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext -pip3 install --pre torch torchvision torchaudio torchtext -f https://download.pytorch.org/whl/test/cu116/torch_test.html +pip3 install --pre torch torchvision torchaudio torchtext -f https://download.pytorch.org/whl/test torchdata https://download.pytorch.org/whl/test/cu116/torch_test.html +pip install --pre --extra-index-url https://download.pytorch.org/whl/test torchdata # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From d6e72e015df9b89e5ac31fd6de8710d899976b99 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Oct 2022 13:42:54 -0700 Subject: [PATCH 14/23] Update build.sh --- .jenkins/build.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 23cc9d4dbdc..f3c82763e96 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -29,8 +29,7 @@ pip install -r $DIR/../requirements.txt # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext -pip3 install --pre torch torchvision torchaudio torchtext -f https://download.pytorch.org/whl/test torchdata https://download.pytorch.org/whl/test/cu116/torch_test.html -pip install --pre --extra-index-url 
https://download.pytorch.org/whl/test torchdata +pip install --pre -f https://download.pytorch.org/whl/test torch torchvision torchaudio torchtext # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From 5fbf500bf90518fe8b881c26b683b85692c3d8e3 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Oct 2022 13:51:19 -0700 Subject: [PATCH 15/23] Update build.sh --- .jenkins/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index f3c82763e96..2ae72906c4b 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -25,11 +25,11 @@ pip install -r $DIR/../requirements.txt # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext -# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext +# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext -pip install --pre -f https://download.pytorch.org/whl/test torch torchvision torchaudio torchtext +pip install --pre torch torchdata torchvision torchaudio torchtext -f https://download.pytorch.org/whl/test/cu116/torch_test.html # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From 3c7694f89a125621c7705cae2d4c99c25767286d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Oct 2022 15:00:37 -0700 Subject: [PATCH 16/23] Update build.sh --- .jenkins/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 2ae72906c4b..7019766f144 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -29,7 +29,7 @@ pip install -r $DIR/../requirements.txt # Test enabled for PyTorch 1.13 RC Below pip uninstall -y torch torchvision torchaudio torchtext -pip install --pre torch torchdata torchvision torchaudio torchtext -f https://download.pytorch.org/whl/test/cu116/torch_test.html +pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu116/torch_test.html torch torchdata torchvision torchaudio torchtext # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm From 3559c44e1b35d16ca0e31daca1f27448f9eb70e1 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Oct 2022 15:36:00 -0700 Subject: [PATCH 17/23] Remove functorch --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 74e2da5fad3..67bec81cd2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,6 @@ torchvision torchtext torchaudio torchdata -functorch>=0.2.1 networkx PyHamcrest bs4 From 06b98742c32ffc60bf3967b266e30014c69d08e3 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 19 Oct 2022 13:28:00 -0700 Subject: [PATCH 18/23] Temporarily disabling fx_numeric_suite_tutorial --- .jenkins/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 7019766f144..fee713f588f 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -49,6 +49,8 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # python $DIR/remove_runnable_code.py 
intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true # Temp remove for 1.10 release. # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true + # Temp remove for 1.13 release. + python $DIR/remove_runnable_code.py beginner_source/fx_numeric_suite_tutorial.py || true # TODO: Fix bugs in these tutorials to make them runnable again # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true From 3c0fc31990aa12094a70ca536c3131ff4343a70d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 19 Oct 2022 14:19:51 -0700 Subject: [PATCH 19/23] Update build.sh --- .jenkins/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index fee713f588f..5263045a4bd 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -50,7 +50,7 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Temp remove for 1.10 release. # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true # Temp remove for 1.13 release. - python $DIR/remove_runnable_code.py beginner_source/fx_numeric_suite_tutorial.py || true + python $DIR/remove_runnable_code.py beginner_source/fx_numeric_suite_tutorial.py beginner_source/fx_numeric_suite_tutorial.py || true # TODO: Fix bugs in these tutorials to make them runnable again # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true From a449a551d8b074766b689866297a73f9af782da3 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Oct 2022 08:32:37 -0700 Subject: [PATCH 20/23] Disable in the validate list --- .jenkins/validate_tutorials_built.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index 92570124a4e..cc01326b44c 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -50,6 +50,7 @@ "recipes/Captum_Recipe", "hyperparameter_tuning_tutorial", "flask_rest_api_tutorial", + "fx_numeric_suite_tutorial", # remove when https://github.com/pytorch/tutorials/pull/2089 is fixed ] From 047a956b922a6485eff76d3ce46f319f235be173 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Oct 2022 08:52:52 -0700 Subject: [PATCH 21/23] Disable ax tutorial --- .jenkins/validate_tutorials_built.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index cc01326b44c..5f9d563475f 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -51,6 +51,7 @@ "hyperparameter_tuning_tutorial", "flask_rest_api_tutorial", "fx_numeric_suite_tutorial", # remove when https://github.com/pytorch/tutorials/pull/2089 is fixed + "ax_multiobjective_nas_tutorial", ] From cff152efb0ebc51fb0decaa7254d1c365065d899 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 26 Oct 2022 15:00:59 -0700 Subject: [PATCH 22/23] rebase --- .jenkins/build.sh | 7 ------- .jenkins/validate_tutorials_built.py | 2 -- requirements.txt | 3 ++- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 5263045a4bd..edd2ffa5cb3 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -25,11 +25,6 @@ pip install -r $DIR/../requirements.txt # 
Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext -# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext - -# Test enabled for PyTorch 1.13 RC Below -pip uninstall -y torch torchvision torchaudio torchtext -pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu116/torch_test.html torch torchdata torchvision torchaudio torchtext # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm @@ -49,8 +44,6 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # python $DIR/remove_runnable_code.py intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true # Temp remove for 1.10 release. # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true - # Temp remove for 1.13 release. - python $DIR/remove_runnable_code.py beginner_source/fx_numeric_suite_tutorial.py beginner_source/fx_numeric_suite_tutorial.py || true # TODO: Fix bugs in these tutorials to make them runnable again # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index 5f9d563475f..92570124a4e 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -50,8 +50,6 @@ "recipes/Captum_Recipe", "hyperparameter_tuning_tutorial", "flask_rest_api_tutorial", - "fx_numeric_suite_tutorial", # remove when https://github.com/pytorch/tutorials/pull/2089 is fixed - "ax_multiobjective_nas_tutorial", ] diff --git a/requirements.txt b/requirements.txt index 34d777076b7..28a37d88ac2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ torchvision torchtext torchaudio torchdata +functorch>=0.2.1 networkx PyHamcrest bs4 @@ -28,7 +29,7 @@ ax-platform nbformat>=4.2.0 datasets transformers -torchmultimodal-nightly +torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable deep_phonemizer==0.0.17 # the following is necessary due to https://github.com/python/importlib_metadata/issues/411 From 1c114442e081b0e538a28fd03c6d076236a257c9 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 26 Oct 2022 15:02:32 -0700 Subject: [PATCH 23/23] Small fix --- .jenkins/build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index edd2ffa5cb3..3aca5ba2a01 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -25,6 +25,9 @@ pip install -r $DIR/../requirements.txt # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext +# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext +# pip uninstall -y torch torchvision torchaudio torchtext +# pip install -f https://download.pytorch.org/whl/test/cu111/torch_test.html torch torchvision torchaudio torchtext # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm