Commit a3dfc6e

Merge branch 'DQN_revise_training' of github.com:SiftingSands/tutorials into DQN_revise_training

2 parents: 1015af6 + 68e29a3

28 files changed, +1526 -59 lines

.circleci/config.yml

Lines changed: 3 additions & 2 deletions

@@ -134,21 +134,22 @@ pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults
        fi
        set -x

+       echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash
        docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace"

        export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
        echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts

 pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults
   environment:
-    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7"
     CUDA_VERSION: "9"
   resource_class: gpu.nvidia.small
   <<: *pytorch_tutorial_build_defaults

 pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults
   environment:
-    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7"
   resource_class: medium
   <<: *pytorch_tutorial_build_defaults

.circleci/config.yml.in

Lines changed: 3 additions & 2 deletions

@@ -134,21 +134,22 @@ pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults
        fi
        set -x

+       echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash
        docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace"

        export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
        echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts

 pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults
   environment:
-    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7"
     CUDA_VERSION: "9"
   resource_class: gpu.nvidia.small
   <<: *pytorch_tutorial_build_defaults

 pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults
   environment:
-    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+    DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7"
   resource_class: medium
   <<: *pytorch_tutorial_build_defaults
{% raw %}

.jenkins/build.sh

Lines changed: 3 additions & 4 deletions

@@ -25,9 +25,9 @@ pip install -r $DIR/../requirements.txt
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # RC Link
 # pip uninstall -y torch torchvision torchaudio torchtext
-# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext
+# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext
 # pip uninstall -y torch torchvision torchaudio torchtext
-# pip install -f https://download.pytorch.org/whl/test/cu111/torch_test.html torch torchvision torchaudio torchtext
+# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu116/torch_test.html torch torchdata torchvision torchaudio torchtext

 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm

@@ -47,14 +47,13 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then
 # python $DIR/remove_runnable_code.py intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true
 # Temp remove for 1.10 release.
 # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true
-
+
 # TODO: Fix bugs in these tutorials to make them runnable again
 # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true

 # Remove runnable code from tensorboard_profiler_tutorial.py as it frequently crashes, see https://github.com/pytorch/pytorch/issues/74139
 # python $DIR/remove_runnable_code.py intermediate_source/tensorboard_profiler_tutorial.py intermediate_source/tensorboard_profiler_tutorial.py || true

-
 # Step 2: Keep certain tutorials based on file count, and remove runnable code in all other tutorials
 # IMPORTANT NOTE: We assume that each tutorial has a UNIQUE filename.
 export WORKER_ID=$(echo "${JOB_BASE_NAME}" | tr -dc '0-9')
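The WORKER_ID export strips everything but digits from the job name, so a job like "pytorch_tutorial_pr_build_worker_3" yields worker ID 3. The actual partitioning lives further down in build.sh; as a rough sketch of the idea hinted at by the "Step 2" comment (hypothetical code, not the CI script itself, with NUM_WORKERS and the slicing scheme assumed), each worker keeps every NUM_WORKERS-th tutorial from the sorted list of unique filenames and strips runnable code from the rest:

    # Hypothetical sketch of count-based sharding across CI workers.
    # NUM_WORKERS and the slicing scheme are illustrative, not taken
    # from the actual .jenkins/build.sh.
    NUM_WORKERS = 20
    worker_id = 3

    all_tutorials = sorted([
        "cifar10_tutorial.py",
        "introyt1_tutorial.py",
        "trainingyt.py",
        # ... every tutorial file, each with a unique name
    ])
    # This worker builds only its slice; the rest get runnable code removed.
    keep = all_tutorials[worker_id::NUM_WORKERS]
    print(keep)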

.jenkins/validate_tutorials_built.py

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@
     "recipes/Captum_Recipe",
     "hyperparameter_tuning_tutorial",
     "flask_rest_api_tutorial",
+    "text_to_speech_with_torchaudio",
 ]
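The hunk adds one entry to a skip-list of tutorials that the validator does not require to have built. A hypothetical sketch of how such a list might gate validation (the names SKIP and should_validate are illustrative, not from the actual script):

    # Hypothetical validator logic; only the list entries are from the diff.
    SKIP = [
        "recipes/Captum_Recipe",
        "hyperparameter_tuning_tutorial",
        "flask_rest_api_tutorial",
        "text_to_speech_with_torchaudio",
    ]

    def should_validate(tutorial_name: str) -> bool:
        # A tutorial is checked only when no skip entry matches its name.
        return not any(s in tutorial_name for s in SKIP)

    assert not should_validate("beginner/text_to_speech_with_torchaudio")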

Makefile

Lines changed: 4 additions & 0 deletions

@@ -102,6 +102,10 @@ download:
 	wget -nv -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR)
 	cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth

+	# Download vocab for beginner_source/flava_finetuning_tutorial.py
+	wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR)
+	tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
+
 docs:
 	make download
(Three binary files changed, 108 KB, 211 KB, and 116 KB; contents not rendered.)

beginner_source/blitz/cifar10_tutorial.py

Lines changed: 2 additions & 2 deletions

@@ -105,7 +105,7 @@ def imshow(img):

 # get some random training images
 dataiter = iter(trainloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # show images
 imshow(torchvision.utils.make_grid(images))

@@ -210,7 +210,7 @@ def forward(self, x):
 # Okay, first step. Let us display an image from the test set to get familiar.

 dataiter = iter(testloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # print images
 imshow(torchvision.utils.make_grid(images))
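These dataiter.next() edits, and the identical ones in the introyt tutorials below, track an API change: PyTorch's DataLoader iterator implements the standard Python iterator protocol (__next__), and the legacy .next() alias has been removed in recent releases, so the built-in next() is the portable spelling. A minimal sketch of the pattern, assuming only torch is installed:

    # Minimal sketch: use the built-in next() on a DataLoader iterator.
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.randn(8, 3, 32, 32), torch.arange(8))
    loader = DataLoader(dataset, batch_size=4)

    dataiter = iter(loader)
    images, labels = next(dataiter)      # works on any Python iterator
    # images, labels = dataiter.next()   # AttributeError on current PyTorch

Since next() works on any Python iterator, this spelling keeps the tutorials version-agnostic.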

beginner_source/deep_learning_60min_blitz.rst

Lines changed: 4 additions & 4 deletions

@@ -37,28 +37,28 @@ packages installed.
 .. grid:: 4

     .. grid-item-card:: :octicon:`file-code;1em` Tensors
-       :link: /beginner/blitz/tensor_tutorial.html
+       :link: blitz/tensor_tutorial.html

        In this tutorial, you will learn the basics of PyTorch tensors.
        +++
        :octicon:`code;1em` Code

     .. grid-item-card:: :octicon:`file-code;1em` A Gentle Introduction to torch.autograd
-       :link: /beginner/blitz/autograd_tutorial.html
+       :link: blitz/autograd_tutorial.html

        Learn about autograd.
        +++
        :octicon:`code;1em` Code

     .. grid-item-card:: :octicon:`file-code;1em` Neural Networks
-       :link: /beginner/blitz/neural_networks_tutorial.html
+       :link: blitz/neural_networks_tutorial.html

        This tutorial demonstrates how you can train neural networks in PyTorch.
        +++
        :octicon:`code;1em` Code

     .. grid-item-card:: :octicon:`file-code;1em` Training a Classifier
-       :link: /beginner/blitz/cifar10_tutorial.html
+       :link: blitz/cifar10_tutorial.html

        Learn how to train an image classifier in PyTorch by using the
        CIFAR10 dataset.

beginner_source/flava_finetuning_tutorial.py

Lines changed: 190 additions & 0 deletions

@@ -0,0 +1,190 @@
+# -*- coding: utf-8 -*-
+"""
+TorchMultimodal Tutorial: Finetuning FLAVA
+============================================
+"""
+
+######################################################################
+# Multimodal AI has recently become very popular owing to its ubiquitous
+# nature, from use cases like image captioning and visual search to more
+# recent applications like image generation from text. **TorchMultimodal
+# is a library powered by PyTorch consisting of building blocks and
+# end-to-end examples, aiming to enable and accelerate research in
+# multimodality**.
+#
+# In this tutorial, we will demonstrate how to use a **pretrained SoTA
+# model called** `FLAVA <https://arxiv.org/pdf/2112.04482.pdf>`__ **from
+# the TorchMultimodal library to finetune on a multimodal task, i.e.,
+# visual question answering** (VQA). The model consists of two unimodal
+# transformer-based encoders for text and image and a multimodal encoder
+# to combine the two embeddings. It is pretrained using contrastive,
+# image-text matching, and text, image, and multimodal masking losses.
+
+
+######################################################################
+# Installation
+# -----------------
+# We will use the TextVQA dataset and the BERT tokenizer from HuggingFace
+# for this tutorial, so you need to install datasets and transformers in
+# addition to TorchMultimodal.
+#
+# .. note::
+#
+#    When running this tutorial in Google Colab, install the required packages by
+#    creating a new cell and running the following commands:
+#
+# .. code-block::
+#
+#    !pip install torchmultimodal-nightly
+#    !pip install datasets
+#    !pip install transformers
+#
+
+######################################################################
+# Steps
+# -----
+#
+# 1. Download the vocab file of answer classes to a directory on your
+#    computer by running the following command:
+#
+# .. code-block::
+#
+#    wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz
+#    tar xf vocab.tar.gz
+#
+# .. note::
+#    If you are running this tutorial in Google Colab, run these commands
+#    in a new cell and prepend them with an exclamation mark (!).
+#
+#
+# 2. For this tutorial, we treat VQA as a classification task where
+#    the inputs are images and questions (text) and the output is an
+#    answer class, so we use the downloaded vocab file of answer classes
+#    to create the answer-to-label mapping.
+#
+# We also load the `textvqa
+# dataset <https://arxiv.org/pdf/1904.08920.pdf>`__ containing 34602 training samples
+# (images, questions, and answers) from HuggingFace.
+#
+# We see there are 3997 answer classes, including a class representing
+# unknown answers.
+#
+
+with open("data/vocabs/answers_textvqa_more_than_1.txt") as f:
+    vocab = f.readlines()
+
+answer_to_idx = {}
+for idx, entry in enumerate(vocab):
+    answer_to_idx[entry.strip("\n")] = idx
+print(len(vocab))
+print(vocab[:5])
+
+from datasets import load_dataset
+dataset = load_dataset("textvqa")
+
+######################################################################
+# Let's display a sample entry from the dataset:
+#
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+idx = 5
+print("Question: ", dataset["train"][idx]["question"])
+print("Answers: ", dataset["train"][idx]["answers"])
+im = np.asarray(dataset["train"][idx]["image"].resize((500, 500)))
+plt.imshow(im)
+plt.show()
+
+
+######################################################################
+# 3. Next, we write the transform function to convert the image and text
+#    into Tensors consumable by our model:
+#
+#    - For images, we use the transforms from torchvision to convert to
+#      Tensor and resize to a uniform size.
+#    - For text, we tokenize (and pad) the question using the
+#      BertTokenizer from HuggingFace.
+#    - For answers (i.e. labels), we take the most frequently occurring
+#      answer as the label to train with:
+#

+import torch
+from torchvision import transforms
+from collections import defaultdict
+from transformers import BertTokenizer
+from functools import partial
+
+def transform(tokenizer, input):
+    batch = {}
+    image_transform = transforms.Compose([transforms.ToTensor(), transforms.Resize([224, 224])])
+    image = image_transform(input["image"][0].convert("RGB"))
+    batch["image"] = [image]
+
+    tokenized = tokenizer(input["question"], return_tensors="pt", padding="max_length", max_length=512)
+    batch.update(tokenized)
+
+    # Use the most frequently occurring answer as the training label.
+    ans_to_count = defaultdict(int)
+    for ans in input["answers"][0]:
+        ans_to_count[ans] += 1
+    max_value = max(ans_to_count, key=ans_to_count.get)
+    ans_idx = answer_to_idx.get(max_value, 0)
+    batch["answers"] = torch.as_tensor([ans_idx])
+    return batch
+
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", padding="max_length", max_length=512)
+transform = partial(transform, tokenizer)
+dataset.set_transform(transform)
+
+
+######################################################################
+# 4. Finally, we import the flava_model_for_classification from
+#    torchmultimodal. It loads the pretrained FLAVA checkpoint by default
+#    and includes a classification head.
+#
+# The model forward function passes the image through the visual encoder
+# and the question through the text encoder. The image and question
+# embeddings are then passed through the multimodal encoder. The final
+# embedding corresponding to the CLS token is passed through an MLP head
+# which finally gives a probability distribution over the possible
+# answers.
+#
+
+from torchmultimodal.models.flava.model import flava_model_for_classification
+model = flava_model_for_classification(num_classes=len(vocab))
+
+
+######################################################################
+# 5. We put together the dataset and model in a toy training loop to
+#    demonstrate how to train the model for 3 iterations:
+#
+
+from torch import nn
+from torch.utils.data import DataLoader
+
+BATCH_SIZE = 2
+MAX_STEPS = 3
+
+train_dataloader = DataLoader(dataset["train"], batch_size=BATCH_SIZE)
+optimizer = torch.optim.AdamW(model.parameters())
+
+epochs = 1
+for _ in range(epochs):
+    for idx, batch in enumerate(train_dataloader):
+        optimizer.zero_grad()
+        out = model(text=batch["input_ids"], image=batch["image"], labels=batch["answers"])
+        loss = out.loss
+        loss.backward()
+        optimizer.step()
+        print(f"Loss at step {idx} = {loss}")
+        if idx >= MAX_STEPS - 1:
+            break
+
+
+######################################################################
+# Conclusion
+# -------------------
+#
+# This tutorial introduced the basics of how to finetune on a
+# multimodal task using FLAVA from TorchMultimodal. Please also check out
+# other examples from the library, like
+# `MDETR <https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/mdetr>`__,
+# which is a multimodal model for object detection, and
+# `Omnivore <https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/models/omnivore.py>`__,
+# which is a multitask model spanning image, video, and 3D classification.
+#
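Once finetuned, the same batches can be reused for a quick qualitative check. A hedged sketch follows; it assumes the classification output exposes a logits field alongside loss (verify against the torchmultimodal version you have installed), and it reuses model, train_dataloader, and answer_to_idx from the tutorial above:

    # Hypothetical inference sketch reusing objects defined in the tutorial.
    idx_to_answer = {i: ans for ans, i in answer_to_idx.items()}

    model.eval()
    with torch.no_grad():
        batch = next(iter(train_dataloader))
        out = model(text=batch["input_ids"], image=batch["image"],
                    labels=batch["answers"])
        preds = out.logits.argmax(dim=-1)  # logits field is an assumption
        print([idx_to_answer[p.item()] for p in preds])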

beginner_source/introyt/introyt1_tutorial.py

Lines changed: 2 additions & 2 deletions

@@ -369,7 +369,7 @@ def imshow(img):

 # get some random training images
 dataiter = iter(trainloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # show images
 imshow(torchvision.utils.make_grid(images))

@@ -446,7 +446,7 @@ def imshow(img):

 # get some random training images
 dataiter = iter(trainloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # show images
 imshow(torchvision.utils.make_grid(images))

beginner_source/introyt/tensorboardyt_tutorial.py

Lines changed: 2 additions & 2 deletions

@@ -115,7 +115,7 @@ def matplotlib_imshow(img, one_channel=False):

 # Extract a batch of 4 images
 dataiter = iter(training_loader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # Create a grid from the images and show them
 img_grid = torchvision.utils.make_grid(images)

@@ -242,7 +242,7 @@ def forward(self, x):

 # Again, grab a single mini-batch of images
 dataiter = iter(training_loader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # add_graph() will trace the sample input through your model,
 # and render it as a graph.

beginner_source/introyt/trainingyt.py

Lines changed: 1 addition & 1 deletion

@@ -112,7 +112,7 @@ def matplotlib_imshow(img, one_channel=False):
     plt.imshow(np.transpose(npimg, (1, 2, 0)))

 dataiter = iter(training_loader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # Create a grid from the images and show them
 img_grid = torchvision.utils.make_grid(images)
