diff --git a/_static/img/bert.png b/_static/img/bert.png
deleted file mode 100644
index 6e23a8acfd3..00000000000
Binary files a/_static/img/bert.png and /dev/null differ
diff --git a/_static/img/quantized_transfer_learning.png b/_static/img/quantized_transfer_learning.png
deleted file mode 100644
index c138cbdb0c1..00000000000
Binary files a/_static/img/quantized_transfer_learning.png and /dev/null differ
diff --git a/index.rst b/index.rst
index 5817cadae94..4656b66976a 100644
--- a/index.rst
+++ b/index.rst
@@ -247,18 +247,6 @@ Quantization (experimental)
:figure: /_static/img/qat.png
:description: :doc:`advanced/static_quantization_tutorial`
-.. customgalleryitem::
- :tooltip: Perform quantized transfer learning with feature extractor
- :description: :doc:`/intermediate/quantized_transfer_learning_tutorial`
- :figure: /_static/img/quantized_transfer_learning.png
-
-.. customgalleryitem::
- :tooltip: Convert a well-known state-of-the-art model like BERT into dynamic quantized model
- :description: :doc:`/intermediate/dynamic_quantization_bert_tutorial`
- :figure: /_static/img/bert.png
-
-
-
.. raw:: html
@@ -340,7 +328,7 @@ PyTorch Fundamentals In-Depth
beginner/text_sentiment_ngrams_tutorial
beginner/torchtext_translation_tutorial
beginner/transformer_tutorial
-
+
.. toctree::
:maxdepth: 2
:includehidden:
@@ -397,8 +385,6 @@ PyTorch Fundamentals In-Depth
advanced/dynamic_quantization_tutorial
advanced/static_quantization_tutorial
- intermediate/quantized_transfer_learning_tutorial
- intermediate/dynamic_quantization_bert_tutorial
.. toctree::
:maxdepth: 2
diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.py b/intermediate_source/dynamic_quantization_bert_tutorial.py
deleted file mode 100644
index 0bdc91ce14c..00000000000
--- a/intermediate_source/dynamic_quantization_bert_tutorial.py
+++ /dev/null
@@ -1,661 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-(Experimental) Dynamic Quantization on HuggingFace BERT model
-==============================================================
-**Author**: `Jianyu Huang `_
-
-**Reviewed by**: `Raghuraman Krishnamoorthi `_
-
-**Edited by**: `Jessica Lin `_
-
-"""
-
-
-######################################################################
-# Introduction
-# ============
-#
-# In this tutorial, we will apply dynamic quantization to a BERT
-# model, closely following the BERT model from the HuggingFace
-# Transformers examples (https://github.com/huggingface/transformers).
-# With this step-by-step journey, we would like to demonstrate how to
-# convert a well-known state-of-the-art model like BERT into a dynamically
-# quantized model.
-#
-# - BERT, or Bidirectional Encoder Representations from Transformers,
-#   is a new method of pre-training language representations which
-#   achieves state-of-the-art accuracy on many popular
-#   Natural Language Processing (NLP) tasks, such as question answering,
-#   text classification, and others. The original paper can be found
-#   here: https://arxiv.org/pdf/1810.04805.pdf.
-#
-# - Dynamic quantization support in PyTorch converts a float model to a
-# quantized model with static int8 or float16 data types for the
-# weights and dynamic quantization for the activations. The activations
-# are quantized dynamically (per batch) to int8 when the weights are
-# quantized to int8.
-#
-# In PyTorch, we have the ``torch.quantization.quantize_dynamic`` API
-# (https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic),
-# which replaces specified modules with dynamic weight-only quantized
-# versions and outputs the quantized model (a minimal sketch is shown at
-# the end of this introduction).
-#
-# - We demonstrate the accuracy and inference performance results on the
-# Microsoft Research Paraphrase Corpus (MRPC) task
-# (https://www.microsoft.com/en-us/download/details.aspx?id=52398) in
-# the General Language Understanding Evaluation benchmark (GLUE)
-# (https://gluebenchmark.com/). The MRPC (Dolan and Brockett, 2005) is
-# a corpus of sentence pairs automatically extracted from online news
-# sources, with human annotations of whether the sentences in the pair
-# are semantically equivalent. Because the classes are imbalanced (68%
-# positive, 32% negative), we follow common practice and report both
-# accuracy and F1 score
-# (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html).
-#   MRPC is a common NLP task for sentence pair classification, as shown
-#   below.
-#
-# .. figure:: https://gluon-nlp.mxnet.io/_images/bert-sentence-pair.png
-#    :alt: BERT for sentence pair classification
-#
-#    BERT for sentence pair classification
-#
-
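-######################################################################
-# As a minimal sketch of how this API is used (the toy module below is made
-# up purely for illustration; the BERT model in this tutorial is quantized
-# with the same call later on):
-#
-# .. code:: python
-#
-#    import torch
-#
-#    toy_model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
-#    quantized_toy = torch.quantization.quantize_dynamic(
-#        toy_model, {torch.nn.Linear}, dtype=torch.qint8)
-#    # the Linear module is now a dynamically quantized Linear
-#    print(quantized_toy)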
-
-######################################################################
-# Setup
-# =====
-#
-# Install PyTorch and HuggingFace Transformers
-# --------------------------------------------
-#
-# To start this tutorial, let’s first follow the installation instructions
-# for PyTorch and HuggingFace Transformers in their GitHub repositories:
-#
-# - https://github.com/pytorch/pytorch/#installation
-# - https://github.com/huggingface/transformers#installation
-#
-# In addition, we also install the ``sklearn`` package, as we will reuse its
-# built-in F1 score calculation helper function.
-#
-# .. code:: shell
-#
-# !pip install sklearn
-# !pip install transformers
-
-
-######################################################################
-# Because we will be using the experimental parts of PyTorch, it is
-# recommended to install the latest version of ``torch`` and ``torchvision``. You
-# can find the most recent instructions on local installation here:
-# https://pytorch.org/get-started/locally/. For example, to install on
-# Mac:
-#
-# .. code:: shell
-#
-#   !yes y | pip uninstall torch torchvision
-#   !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html
-
-
-######################################################################
-# Import the necessary modules
-# ----------------------------
-#
-# In this step we import the necessary Python modules for the tutorial.
-#
-
-from __future__ import absolute_import, division, print_function
-
-import logging
-import numpy as np
-import os
-import random
-import sys
-import time
-import torch
-
-from argparse import Namespace
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler  # used only when local_rank != -1
-from tqdm import tqdm
-from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,)
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_output_modes as output_modes
-from transformers import glue_processors as processors
-from transformers import glue_convert_examples_to_features as convert_examples_to_features
-
-# Setup logging
-logger = logging.getLogger(__name__)
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
- datefmt = '%m/%d/%Y %H:%M:%S',
- level = logging.WARN)
-
-logging.getLogger("transformers.modeling_utils").setLevel(
- logging.WARN) # Reduce logging
-
-print(torch.__version__)
-# We set the number of threads to 1 to compare the single-threaded performance of FP32 and INT8 inference.
-# At the end of the tutorial, the user can use a different number of threads, provided PyTorch is built with the right parallel backend.
-torch.set_num_threads(1)
-print(torch.__config__.parallel_info())
-
-
-######################################################################
-# Download the dataset
-# --------------------
-#
-# Before running the MRPC task we download the GLUE data
-# (https://gluebenchmark.com/tasks) by running this script
-# (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e,
-# https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py)
-# and unpack it to the directory ``glue_data/MRPC``.
-#
-
-# !python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' --test_labels=True
-!pwd
-!ls
-!wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
-!python download_glue_data.py --data_dir='glue_data' --tasks='MRPC'
-!ls glue_data/MRPC
-
-
-######################################################################
-# Helper functions
-# ----------------
-#
-# The helper functions are built into the transformers library. We mainly use
-# the following helper functions: one for converting the text examples
-# into feature vectors, and the other for measuring the F1 score of
-# the predicted results.
-#
-# Convert the texts into features
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# ``glue_convert_examples_to_features``
-# (https://github.com/huggingface/transformers/blob/master/transformers/data/processors/glue.py)
-# loads a data file into a list of ``InputFeatures``. It performs the following steps:
-#
-# - Tokenize the input sequences;
-# - Insert [CLS] at the beginning;
-# - Insert [SEP] between the first sentence and the second sentence, and
-# at the end;
-# - Generate token type ids to indicate whether a token belongs to the
-# first sequence or the second sequence;
-#
-# F1 metric
-# ~~~~~~~~~
-#
-# The F1 score
-# (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
-# can be interpreted as a weighted average of the precision and recall,
-# where an F1 score reaches its best value at 1 and its worst value at 0. The
-# relative contributions of precision and recall to the F1 score are equal.
-# The formula for the F1 score is:
-#
-# F1 = 2 \* (precision \* recall) / (precision + recall)
-#
-
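-######################################################################
-# As a quick, self-contained illustration of the two helpers described above
-# (the sentence pair below is made up, and the stock "bert-base-uncased"
-# tokenizer is downloaded just for this illustration):
-#
-
-from sklearn.metrics import f1_score
-
-# F1 = 2 * (precision * recall) / (precision + recall)
-print(f1_score(y_true=[0, 1, 1, 0], y_pred=[0, 1, 0, 0]))  # 0.666...
-
-example_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-encoded = example_tokenizer.encode_plus("The cat sat on the mat.",
-                                        "A cat was sitting on a mat.")
-# [CLS] at the beginning, [SEP] between and after the two sentences
-print(example_tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
-# 0 marks tokens from the first sentence, 1 marks tokens from the second
-print(encoded["token_type_ids"])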
-
-######################################################################
-# Fine-tune the BERT model
-# ========================
-#
-
-
-######################################################################
-# The spirit of BERT is to pre-train the language representations and then
-# fine-tune the deep bi-directional representations on a wide range of
-# tasks with minimal task-dependent parameters, achieving
-# state-of-the-art results. In this tutorial, we will focus on fine-tuning
-# the pre-trained BERT model to classify semantically equivalent
-# sentence pairs on the MRPC task.
-#
-# To fine-tune the pre-trained BERT model (the “bert-base-uncased” model in
-# HuggingFace transformers) for the MRPC task, you can follow the commands
-# in the examples (https://github.com/huggingface/transformers/tree/master/examples):
-#
-# ::
-#
-# export GLUE_DIR=./glue_data
-# export TASK_NAME=MRPC
-# export OUT_DIR=/mnt/homedir/jianyuhuang/public/bert/$TASK_NAME/
-# python ./run_glue.py \
-# --model_type bert \
-# --model_name_or_path bert-base-uncased \
-# --task_name $TASK_NAME \
-# --do_train \
-# --do_eval \
-# --do_lower_case \
-# --data_dir $GLUE_DIR/$TASK_NAME \
-# --max_seq_length 128 \
-# --per_gpu_eval_batch_size=8 \
-# --per_gpu_train_batch_size=8 \
-# --learning_rate 2e-5 \
-# --num_train_epochs 3.0 \
-# --save_steps 100000 \
-# --output_dir $OUT_DIR
-#
-# We provide the fine-tuned BERT model for the MRPC task here (we did the
-# fine-tuning on CPUs with a total train batch size of 8):
-#
-# https://drive.google.com/drive/folders/1mGBx0t-YJAWXHbgab2f_IimaMiVHlKh-
-#
-# To save time, you can manually copy the fine-tuned BERT model for the MRPC
-# task into your Google Drive (create the same “BERT_Quant_Tutorial/MRPC”
-# folder in your Google Drive directory), and then mount your Google Drive
-# on your runtime using an authorization code, so that we can directly
-# read and write the models to Google Drive in the following steps.
-#
-
-from google.colab import drive
-drive.mount('/content/drive')
-
-!ls
-!pwd
-
-
-######################################################################
-# Set global configurations
-# -------------------------
-#
-
-
-######################################################################
-# Here we set the global configurations for evaluating the fine-tuned BERT
-# model before and after the dynamic quantization.
-#
-
-configs = Namespace()
-
-# The output directory for the fine-tuned model.
-# configs.output_dir = "/mnt/homedir/jianyuhuang/public/bert/MRPC/"
-configs.output_dir = "/content/drive/My Drive/BERT_Quant_Tutorial/MRPC/"
-# configs.output_dir = "./MRPC/"
-
-# The data directory for the MRPC task in the GLUE benchmark.
-# configs.data_dir = "/mnt/homedir/jianyuhuang/public/bert/glue_data/MRPC"
-# configs.data_dir = "./glue_data/MRPC"
-configs.data_dir = "/content/glue_data/MRPC"
-
-# The model name or path for the pre-trained model.
-configs.model_name_or_path = "bert-base-uncased"
-# The maximum length of an input sequence
-configs.max_seq_length = 128
-
-# Prepare GLUE task.
-configs.task_name = "MRPC".lower()
-configs.processor = processors[configs.task_name]()
-configs.output_mode = output_modes[configs.task_name]
-configs.label_list = configs.processor.get_labels()
-configs.model_type = "bert".lower()
-configs.do_lower_case = True
-
-# Set the device, batch size, topology, and caching flags.
-configs.device = "cpu"
-configs.per_gpu_eval_batch_size = 8
-configs.n_gpu = 0
-configs.local_rank = -1
-configs.overwrite_cache = False
-
-
-# Set random seed for reproducibility.
-def set_seed(seed):
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
-set_seed(42)
-
-
-######################################################################
-# Load the fine-tuned BERT model
-# ------------------------------
-#
-
-
-######################################################################
-# We load the tokenizer and fine-tuned BERT sequence classifier model
-# (FP32) from the ``configs.output_dir``.
-#
-
-tokenizer = BertTokenizer.from_pretrained(
- configs.output_dir, do_lower_case=configs.do_lower_case)
-
-model = BertForSequenceClassification.from_pretrained(configs.output_dir)
-model.to(configs.device)
-
-
-######################################################################
-# Define the tokenize and evaluation function
-# -------------------------------------------
-#
-# We reuse the tokenize and evaluation function from
-# https://github.com/huggingface/transformers/blob/master/examples/run_glue.py.
-#
-
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-def evaluate(args, model, tokenizer, prefix=""):
- # Loop to handle MNLI double evaluation (matched, mis-matched)
- eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
- eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
-
- results = {}
- for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
- eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
-
- if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
- os.makedirs(eval_output_dir)
-
- args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
- # Note that DistributedSampler samples randomly
- eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
- eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
- # multi-gpu eval
- if args.n_gpu > 1:
- model = torch.nn.DataParallel(model)
-
- # Eval!
- logger.info("***** Running evaluation {} *****".format(prefix))
- logger.info(" Num examples = %d", len(eval_dataset))
- logger.info(" Batch size = %d", args.eval_batch_size)
- eval_loss = 0.0
- nb_eval_steps = 0
- preds = None
- out_label_ids = None
- for batch in tqdm(eval_dataloader, desc="Evaluating"):
- model.eval()
- batch = tuple(t.to(args.device) for t in batch)
-
- with torch.no_grad():
- inputs = {'input_ids': batch[0],
- 'attention_mask': batch[1],
- 'labels': batch[3]}
- if args.model_type != 'distilbert':
- inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
- outputs = model(**inputs)
- tmp_eval_loss, logits = outputs[:2]
-
- eval_loss += tmp_eval_loss.mean().item()
- nb_eval_steps += 1
- if preds is None:
- preds = logits.detach().cpu().numpy()
- out_label_ids = inputs['labels'].detach().cpu().numpy()
- else:
- preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
- out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
-
- eval_loss = eval_loss / nb_eval_steps
- if args.output_mode == "classification":
- preds = np.argmax(preds, axis=1)
- elif args.output_mode == "regression":
- preds = np.squeeze(preds)
- result = compute_metrics(eval_task, preds, out_label_ids)
- results.update(result)
-
- output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
- with open(output_eval_file, "w") as writer:
- logger.info("***** Eval results {} *****".format(prefix))
- for key in sorted(result.keys()):
- logger.info(" %s = %s", key, str(result[key]))
- writer.write("%s = %s\n" % (key, str(result[key])))
-
- return results
-
-
-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
- if args.local_rank not in [-1, 0] and not evaluate:
- torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
- processor = processors[task]()
- output_mode = output_modes[task]
- # Load data features from cache or dataset file
- cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
- 'dev' if evaluate else 'train',
- list(filter(None, args.model_name_or_path.split('/'))).pop(),
- str(args.max_seq_length),
- str(task)))
- if os.path.exists(cached_features_file) and not args.overwrite_cache:
- logger.info("Loading features from cached file %s", cached_features_file)
- features = torch.load(cached_features_file)
- else:
- logger.info("Creating features from dataset file at %s", args.data_dir)
- label_list = processor.get_labels()
- if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
- # HACK(label indices are swapped in RoBERTa pretrained model)
- label_list[1], label_list[2] = label_list[2], label_list[1]
- examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
- features = convert_examples_to_features(examples,
- tokenizer,
- label_list=label_list,
- max_length=args.max_seq_length,
- output_mode=output_mode,
- pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
- pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
- pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
- )
- if args.local_rank in [-1, 0]:
- logger.info("Saving features into cached file %s", cached_features_file)
- torch.save(features, cached_features_file)
-
- if args.local_rank == 0 and not evaluate:
- torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
- # Convert to Tensors and build dataset
- all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
- all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
- all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
- if output_mode == "classification":
- all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
- elif output_mode == "regression":
- all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-
- dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
- return dataset
-
-
-
-######################################################################
-# Apply the dynamic quantization
-# ==============================
-#
-# We call ``torch.quantization.quantize_dynamic`` on the model to apply
-# the dynamic quantization on the HuggingFace BERT model. Specifically,
-#
-# - We specify that we want the torch.nn.Linear modules in our model to
-# be quantized;
-# - We specify that we want weights to be converted to quantized int8
-# values.
-#
-
-quantized_model = torch.quantization.quantize_dynamic(
- model, {torch.nn.Linear}, dtype=torch.qint8
-)
-print(quantized_model)
-
-
-
-######################################################################
-# In the PyTorch 1.4 release, we further added per-channel quantization
-# support for dynamic quantization.
-#
-# .. figure:: https://drive.google.com/open?id=1N6P70MR6jJ2tcFnFJ2lROLSFqmiOY--g
-# :alt: Per Tensor Quantization for Weight
-#
-# Per Tensor Quantization for Weight
-#
-# .. figure:: https://drive.google.com/open?id=1nyjUKP5qtkRCJPKtUaXXwhglLMQQ0Dfs
-# :alt: Per Channel Quantization for Weight
-#
-# Per Channel Quantization for Weight
-#
-
-qconfig_dict = {
- torch.nn.Linear: torch.quantization.per_channel_dynamic_qconfig
-}
-per_channel_quantized_model = torch.quantization.quantize_dynamic(
- model, qconfig_dict, dtype=torch.qint8
-)
-
-
-######################################################################
-# Check the model size
-# --------------------
-#
-# Let’s first check the model size. We can observe a significant reduction
-# in model size:
-#
-
-def print_size_of_model(model):
- torch.save(model.state_dict(), "temp.p")
- print('Size (MB):', os.path.getsize("temp.p")/1e6)
- os.remove('temp.p')
-
-print_size_of_model(model)
-print_size_of_model(quantized_model)
-# print_size_of_model(per_channel_quantized_model)
-
-
-
-
-######################################################################
-# The BERT model used in this tutorial (bert-base-uncased) has a
-# vocabulary size V of 30522. With an embedding size of 768, the total
-# size of the word embedding table is ~ 4 (Bytes/FP32) \* 30522 \* 768 =
-# 90 MB. So with the help of quantization, the size of the non-embedding
-# part of the model is reduced from 350 MB (in the FP32 model) to 90 MB
-# (in the INT8 model), while the embedding table is left in FP32.
-#
-
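-######################################################################
-# A quick back-of-the-envelope check of the numbers above (the vocabulary
-# and hidden sizes are those of the "bert-base-uncased" configuration):
-#
-
-V = 30522   # vocabulary size
-H = 768     # embedding (hidden) size
-print("Word embedding table: ~{:.0f} MB".format(4 * V * H / 1e6))  # FP32 uses 4 bytes/parameter
-# The ~438 MB FP32 model minus the ~90 MB embedding table leaves ~350 MB of
-# weights; dynamic quantization stores those in int8 (~1 byte/parameter),
-# which is what brings the serialized model down to ~181 MB.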
-
-######################################################################
-# Evaluate the inference accuracy and time
-# ----------------------------------------
-#
-# Next, let’s compare the inference time as well as the evaluation
-# accuracy between the original FP32 model and the INT8 model after the
-# dynamic quantization.
-#
-
-# Evaluate the original FP32 BERT model
-def time_model_evaluation(model, configs, tokenizer):
- eval_start_time = time.time()
- result = evaluate(configs, model, tokenizer, prefix="")
- eval_end_time = time.time()
- eval_duration_time = eval_end_time - eval_start_time
- print(result)
- print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))
-
-time_model_evaluation(model, configs, tokenizer)
-
-# Evaluate the INT8 BERT model after the dynamic quantization
-time_model_evaluation(quantized_model, configs, tokenizer)
-
-
-######################################################################
-# Running this locally on a MacBook Pro, inference (for all 408 examples
-# in the MRPC dataset) takes about 160 seconds without quantization, and
-# only about 90 seconds with quantization. We summarize the results
-# for running the quantized BERT model inference on a MacBook Pro as
-# follows:
-#
-# ::
-#
-# | Prec | F1 score | Model Size | 1 thread | 4 threads |
-# | FP32 | 0.9019 | 438 MB | 160 sec | 85 sec |
-# | INT8 | 0.8953 | 181 MB | 90 sec | 46 sec |
-#
-# The F1 score is only 0.6% lower after applying post-training dynamic
-# quantization to the fine-tuned BERT model on the MRPC task. As a
-# comparison, the recent paper [3] (Table 1) achieved 0.8788 by
-# applying post-training dynamic quantization and 0.8956 by applying
-# quantization-aware training. The main reason is that PyTorch supports
-# asymmetric quantization, while that paper uses
-# symmetric quantization only.
-#
-# Note that we set the number of threads to 1 for the single-thread
-# comparison in this tutorial. We also support intra-op
-# parallelization for these quantized INT8 operators. Users can enable
-# multi-threading with ``torch.set_num_threads(N)`` (``N`` is the number of
-# intra-op parallelization threads). One preliminary requirement for
-# intra-op parallelization support is to build PyTorch with the right
-# backend, such as OpenMP, Native, or TBB
-# (https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#build-options).
-# You can use ``torch.__config__.parallel_info()`` to check the
-# parallelization settings. On the same MacBook Pro, using PyTorch with
-# the Native backend for parallelization, evaluating the MRPC dataset
-# takes about 46 seconds.
-#
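-# For example (shown as a sketch and not executed here, so that the
-# single-thread numbers above stay comparable), multi-threaded inference
-# can be enabled with:
-#
-# .. code:: python
-#
-#    torch.set_num_threads(4)                  # use 4 intra-op threads
-#    print(torch.__config__.parallel_info())   # confirm the parallel backend
-#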
-
-# Evaluate the INT8 BERT model after the per-channel dynamic quantization
-time_model_evaluation(per_channel_quantized_model, configs, tokenizer)
-
-
-
-######################################################################
-# Serialize the quantized model
-# -----------------------------
-#
-# We can serialize and save the quantized model for future use.
-#
-
-quantized_output_dir = configs.output_dir + "quantized/"
-if not os.path.exists(quantized_output_dir):
- os.makedirs(quantized_output_dir)
-quantized_model.save_pretrained(quantized_output_dir)
-
-
-######################################################################
-# Conclusion
-# ==========
-#
-# In this tutorial, we demonstrated how to convert a
-# well-known state-of-the-art NLP model like BERT into a dynamically
-# quantized model. Dynamic quantization can reduce the size of the model while
-# having only a limited impact on accuracy.
-#
-# Thanks for reading! As always, we welcome any feedback, so please create
-# an issue here (https://github.com/pytorch/pytorch/issues) if you have
-# any.
-#
-
-
-######################################################################
-# References
-# ==========
-#
-# [1] J.Devlin, M. Chang, K. Lee and K. Toutanova, BERT: Pre-training of
-# Deep Bidirectional Transformers for Language Understanding (2018)
-#
-# [2] HuggingFace Transformers.
-# https://github.com/huggingface/transformers
-#
-# [3] O. Zafrir, G. Boudoukh, P. Izsak, & M. Wasserblat (2019). Q8BERT:
-# Quantized 8bit BERT. arXiv preprint arXiv:1910.06188.
-#
-
-
-######################################################################
-#
-#
-
-
diff --git a/intermediate_source/quantized_transfer_learning_tutorial.py b/intermediate_source/quantized_transfer_learning_tutorial.py
deleted file mode 100644
index 750d2c9ff29..00000000000
--- a/intermediate_source/quantized_transfer_learning_tutorial.py
+++ /dev/null
@@ -1,530 +0,0 @@
-"""
-Quantized Transfer Learning for Computer Vision Tutorial
-========================================================
-
-**Author**: `Zafar Takhirov `_
-
-**Reviewed by**: `Raghuraman Krishnamoorthi `_
-
-**Edited by**: `Jessica Lin `_
-
-This tutorial builds on the original `PyTorch Transfer Learning `_
-tutorial, written by
-`Sasank Chilamkurthy `_.
-
-Transfer learning refers to techniques that use a pretrained model for
-application on a different dataset. Typical scenarios look as follows:
-
-1. **ConvNet as fixed feature extractor**: Here, you “freeze” [#1]_ the
-   weights of all of the network parameters except those of the final
-   several layers (aka “the head”, usually fully connected layers).
-   These last layers are replaced with new ones initialized with random
-   weights and only these layers are trained.
-2. **Finetuning the ConvNet**: Instead of random initialization, you
-   initialize the network with a pretrained network, such as one trained
-   on the ImageNet 1000 dataset. The rest of the training looks as
-   usual. It is common to set the learning rate to a smaller number, as
-   the network is already considered to be trained.
-
-You can also combine the above two scenarios, and execute them both:
-First you can freeze the feature extractor, and train the head. After
-that, you can unfreeze the feature extractor (or part of it), set the
-learning rate to something smaller, and continue training.
-
-In this part you will use the first scenario – extracting the features
-using a quantized model.
-
-.. rubric:: Footnotes
-
-.. [#1] “Freezing” the model/layer means running it only in inference
-   mode, and not allowing its parameters to be updated during training.
-
-We will start by doing the necessary imports:
-"""
-
-# imports
-import matplotlib.pyplot as plt
-import numpy as np
-import time
-import copy
-
-plt.rc('axes', labelsize=18, titlesize=18)
-plt.rc('figure', titlesize=18)
-plt.rc('font', family='DejaVu Sans', serif='Times', size=18)
-plt.rc('legend', fontsize=18)
-plt.rc('lines', linewidth=3)
-plt.rc('text', usetex=False) # TeX might not be supported
-plt.rc('xtick', labelsize=18)
-plt.rc('ytick', labelsize=18)
-
-######################################################################
-# Installing the Nightly Build
-# ----------------------------
-#
-# Because you will be using the experimental parts of PyTorch, it is
-# recommended to install the latest version of ``torch`` and
-# ``torchvision``. You can find the most recent instructions on local
-# installation `here `_.
-# For example, to install on Mac:
-#
-# .. code:: shell
-#
-# pip install numpy
-# pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
-#
-
-
-
-
-######################################################################
-# Load Data (section not needed as it is covered in the original tutorial)
-# ------------------------------------------------------------------------
-#
-# We will use ``torchvision`` and ``torch.utils.data`` packages to load
-# the data.
-#
-# The problem you are going to solve today is classifying **ants** and
-# **bees** from images. The dataset contains about 120 training images
-# each for ants and bees. There are 75 validation images for each class.
-# This is considered a very small dataset to generalize on. However, since
-# we are using transfer learning, we should be able to generalize
-# reasonably well.
-#
-# *This dataset is a very small subset of ImageNet.*
-#
-# .. Note :: Download the data from
-# `here `_
-# and extract it to the ``data`` directory.
-#
-
-import requests
-import os
-import zipfile
-
-DATA_URL = 'https://download.pytorch.org/tutorial/hymenoptera_data.zip'
-DATA_PATH = os.path.join('.', 'data')
-FILE_NAME = os.path.join(DATA_PATH, 'hymenoptera_data.zip')
-
-if not os.path.isfile(FILE_NAME):
- print("Downloading the data...")
- os.makedirs('data', exist_ok=True)
- with requests.get(DATA_URL) as req:
- with open(FILE_NAME, 'wb') as f:
- f.write(req.content)
- if 200 <= req.status_code < 300:
- print("Download complete!")
- else:
- print("Download failed!")
-else:
- print(FILE_NAME, "already exists, skipping download...")
-
-with zipfile.ZipFile(FILE_NAME, 'r') as zip_ref:
- print("Unzipping...")
- zip_ref.extractall('data')
-
-DATA_PATH = os.path.join(DATA_PATH, 'hymenoptera_data')
-
-import torch
-from torchvision import transforms, datasets
-
-# Data augmentation and normalization for training
-# Just normalization for validation
-data_transforms = {
- 'train': transforms.Compose([
- transforms.Resize(224),
- transforms.RandomCrop(224),
- transforms.RandomHorizontalFlip(),
- transforms.ToTensor(),
- transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
- ]),
- 'val': transforms.Compose([
- transforms.Resize(224),
- transforms.CenterCrop(224),
- transforms.ToTensor(),
- transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
- ]),
-}
-
-image_datasets = {x: datasets.ImageFolder(os.path.join(DATA_PATH, x),
- data_transforms[x])
- for x in ['train', 'val']}
-dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16,
- shuffle=True, num_workers=8)
- for x in ['train', 'val']}
-dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
-class_names = image_datasets['train'].classes
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-
-######################################################################
-# Visualize a few images
-# ^^^^^^^^^^^^^^^^^^^^^^
-#
-# Let’s visualize a few training images so as to understand the data
-# augmentations.
-#
-
-import torchvision
-
-def imshow(inp, title=None, ax=None, figsize=(5, 5)):
- """Imshow for Tensor."""
- inp = inp.numpy().transpose((1, 2, 0))
- mean = np.array([0.485, 0.456, 0.406])
- std = np.array([0.229, 0.224, 0.225])
- inp = std * inp + mean
- inp = np.clip(inp, 0, 1)
- if ax is None:
- fig, ax = plt.subplots(1, figsize=figsize)
- ax.imshow(inp)
- ax.set_xticks([])
- ax.set_yticks([])
- if title is not None:
- ax.set_title(title)
-
-# Get a batch of training data
-inputs, classes = next(iter(dataloaders['train']))
-
-# Make a grid from batch
-out = torchvision.utils.make_grid(inputs, nrow=4)
-
-fig, ax = plt.subplots(1, figsize=(10, 10))
-imshow(out, title=[class_names[x] for x in classes], ax=ax)
-
-
-######################################################################
-# Training the model
-# ------------------
-#
-# Now, let’s write a general function to train a model. Here, we will
-# illustrate:
-#
-# - Scheduling the learning rate
-# - Saving the best model
-#
-# In the following, parameter ``scheduler`` is an LR scheduler object from
-# ``torch.optim.lr_scheduler``.
-#
-
-def train_model(model, criterion, optimizer, scheduler, num_epochs=25, device='cpu'):
- since = time.time()
-
- best_model_wts = copy.deepcopy(model.state_dict())
- best_acc = 0.0
-
- for epoch in range(num_epochs):
- print('Epoch {}/{}'.format(epoch, num_epochs - 1))
- print('-' * 10)
-
- # Each epoch has a training and validation phase
- for phase in ['train', 'val']:
- if phase == 'train':
- model.train() # Set model to training mode
- else:
- model.eval() # Set model to evaluate mode
-
- running_loss = 0.0
- running_corrects = 0
-
- # Iterate over data.
- for inputs, labels in dataloaders[phase]:
- inputs = inputs.to(device)
- labels = labels.to(device)
-
- # zero the parameter gradients
- optimizer.zero_grad()
-
- # forward
- # track history if only in train
- with torch.set_grad_enabled(phase == 'train'):
- outputs = model(inputs)
- _, preds = torch.max(outputs, 1)
- loss = criterion(outputs, labels)
-
- # backward + optimize only if in training phase
- if phase == 'train':
- loss.backward()
- optimizer.step()
-
- # statistics
- running_loss += loss.item() * inputs.size(0)
- running_corrects += torch.sum(preds == labels.data)
- if phase == 'train':
- scheduler.step()
-
- epoch_loss = running_loss / dataset_sizes[phase]
- epoch_acc = running_corrects.double() / dataset_sizes[phase]
-
- print('{} Loss: {:.4f} Acc: {:.4f}'.format(
- phase, epoch_loss, epoch_acc))
-
- # deep copy the model
- if phase == 'val' and epoch_acc > best_acc:
- best_acc = epoch_acc
- best_model_wts = copy.deepcopy(model.state_dict())
-
- print()
-
- time_elapsed = time.time() - since
- print('Training complete in {:.0f}m {:.0f}s'.format(
- time_elapsed // 60, time_elapsed % 60))
- print('Best val Acc: {:4f}'.format(best_acc))
-
- # load best model weights
- model.load_state_dict(best_model_wts)
- return model
-
-
-######################################################################
-# Visualizing the model predictions
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#
-# Generic function to display predictions for a few images
-#
-
-def visualize_model(model, rows=3, cols=3):
- was_training = model.training
- model.eval()
- current_row = current_col = 0
- fig, ax = plt.subplots(rows, cols, figsize=(cols*2, rows*2))
-
- with torch.no_grad():
- for idx, (imgs, lbls) in enumerate(dataloaders['val']):
- imgs = imgs.cpu()
- lbls = lbls.cpu()
-
- outputs = model(imgs)
- _, preds = torch.max(outputs, 1)
-
- for jdx in range(imgs.size()[0]):
- imshow(imgs.data[jdx], ax=ax[current_row, current_col])
- ax[current_row, current_col].axis('off')
- ax[current_row, current_col].set_title('predicted: {}'.format(class_names[preds[jdx]]))
-
- current_col += 1
- if current_col >= cols:
- current_row += 1
- current_col = 0
- if current_row >= rows:
- model.train(mode=was_training)
- return
- model.train(mode=was_training)
-
-
-######################################################################
-# Part 1. Training a Custom Classifier based on a Quantized Feature Extractor
-# ---------------------------------------------------------------------------
-#
-# In this section you will use a “frozen” quantized feature extractor, and
-# train a custom classifier head on top of it. Unlike floating point
-# models, you don’t need to set ``requires_grad=False`` for the quantized
-# model, as it has no trainable parameters. Please refer to the
-# `documentation <https://pytorch.org/docs/stable/quantization.html>`_ for
-# more details.
-#
-# Load a pretrained model: for this exercise you will be using
-# `ResNet-18 <https://pytorch.org/hub/pytorch_vision_resnet/>`_.
-#
-
-import torchvision.models.quantization as models
-
-# We will need the number of filters in the `fc` for future use.
-# Here the size of each output sample is set to 2.
-# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
-model_fe = models.resnet18(pretrained=True, progress=True, quantize=True)
-num_ftrs = model_fe.fc.in_features
-
-
-######################################################################
-# At this point you need to modify the pretrained model: because the model
-# has quantize/dequantize blocks at the beginning and the end, but we
-# will only use the feature extractor, the dequantization layer has to
-# be moved right before the linear layer (the head). The easiest way of doing
-# this is to wrap the model in an ``nn.Sequential`` module.
-#
-# The first step is to isolate the feature extractor in the ResNet
-# model. Although in this example you are tasked with using all layers except
-# ``fc`` as the feature extractor, in reality you can take as many parts
-# as you need. This would be useful in case you would like to replace some
-# of the convolutional layers as well.
-#
-
-
-######################################################################
-# **Notice that when isolating the feature extractor from a quantized
-# model, you have to place the quantizer at its beginning and the
-# dequantizer at its end.**
-#
-
-from torch import nn
-
-def create_combined_model(model_fe):
- # Step 1. Isolate the feature extractor.
- model_fe_features = nn.Sequential(
- model_fe.quant, # Quantize the input
- model_fe.conv1,
- model_fe.bn1,
- model_fe.relu,
- model_fe.maxpool,
- model_fe.layer1,
- model_fe.layer2,
- model_fe.layer3,
- model_fe.layer4,
- model_fe.avgpool,
- model_fe.dequant, # Dequantize the output
- )
-
- # Step 2. Create a new "head"
- new_head = nn.Sequential(
- nn.Dropout(p=0.5),
- nn.Linear(num_ftrs, 2),
- )
-
- # Step 3. Combine, and don't forget the quant stubs.
- new_model = nn.Sequential(
- model_fe_features,
- nn.Flatten(1),
- new_head,
- )
- return new_model
-
-new_model = create_combined_model(model_fe)
-
-
-######################################################################
-# .. warning:: Currently the quantized models can only be run on CPU.
-# However, it is possible to send the non-quantized parts of the model to
-# a GPU.
-#
-
-import torch.optim as optim
-new_model = new_model.to('cpu')
-
-criterion = nn.CrossEntropyLoss()
-
-# Note that we are only training the head.
-optimizer_ft = optim.SGD(new_model.parameters(), lr=0.01, momentum=0.9)
-
-# Decay LR by a factor of 0.1 every 7 epochs
-exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
-
-
-######################################################################
-# Train and evaluate
-# ------------------
-#
-# This step takes around 15-25 min on CPU. Because the quantized model can
-# only run on the CPU, you cannot run the training on GPU.
-#
-
-new_model = train_model(new_model, criterion, optimizer_ft, exp_lr_scheduler,
- num_epochs=25, device='cpu')
-
-visualize_model(new_model)
-plt.tight_layout()
-
-
-######################################################################
-# Part 2. Finetuning the Quantizable Model
-# ----------------------------------------
-#
-# In this part, we fine-tune the feature extractor used for transfer
-# learning, and quantize the feature extractor. Note that in both part 1
-# and 2, the feature extractor is quantized. The difference is that in
-# part 1, we use a pretrained quantized model. In this part, we create a
-# quantized feature extractor after fine-tuning on the dataset of
-# interest, so this is a way to get better accuracy with transfer learning
-# while having the benefits of quantization. Note that in our specific
-# example, the training set is really small (120 images), so the benefits
-# of fine-tuning the entire model are not apparent. However, the procedure
-# shown here will improve accuracy for transfer learning with larger
-# datasets.
-#
-# The pretrained feature extractor must be quantizable, i.e. we need to do
-# the following:
-#
-# 1. Fuse ``(Conv, BN, ReLU)``, ``(Conv, BN)``, and ``(Conv, ReLU)``
-#    using ``torch.quantization.fuse_modules``.
-# 2. Connect the feature extractor with a custom head.
-#    This requires dequantizing the output of the feature extractor.
-# 3. Insert fake-quantization modules at appropriate locations
-#    in the feature extractor to mimic quantization during training.
-# 4. Fine-tune the model with the desired custom head.
-# 5. Convert the fine-tuned model into a quantized model (only
-#    the feature extractor is quantized) by calling
-#    ``torch.quantization.convert``.
-#
-# For step (1), we use models from ``torchvision/models/quantization``, which
-# provide a member method ``fuse_model``, which fuses all the conv, bn, and
-# relu modules. In general, this would require calling the
-# ``torch.quantization.fuse_modules`` API with the list of modules to fuse.
-#
-# Step (2) is done by the ``create_combined_model`` function that we
-# used in the previous section.
-#
-# Step (3) is achieved by using ``torch.quantization.prepare_qat``, which
-# inserts fake-quantization modules.
-#
-# Steps (4) and (5), fine-tuning the model and converting it into a
-# quantized model, are carried out in the sections below.
-#
-# .. note:: Because of the random initialization, your results might differ
-#    from the results shown here.
-#
-
-model = models.resnet18(pretrained=True, progress=True, quantize=False) # notice `quantize=False`
-num_ftrs = model.fc.in_features
-
-# Step 1
-model.train()
-model.fuse_model()
-# Step 2
-model_ft = create_combined_model(model)
-model_ft[0].qconfig = torch.quantization.default_qat_qconfig # Use default QAT configuration
-# Step 3
-model_ft = torch.quantization.prepare_qat(model_ft, inplace=True)
-
-
-
-
-######################################################################
-# Finetuning the model
-# --------------------
-#
-# We fine-tune the entire model, including the feature extractor. In
-# general, this will lead to higher accuracy. However, due to the small
-# training set used here, we end up overfitting to the training set.
-#
-
-# Step 4. Fine tune the model
-
-for param in model_ft.parameters():
- param.requires_grad = True
-
-model_ft.to(device)  # We can fine-tune on GPU if one is available
-
-criterion = nn.CrossEntropyLoss()
-
-# Note that we are training everything, so the learning rate is lower.
-optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-3, momentum=0.9, weight_decay=0.1)
-
-# Decay LR by a factor of 0.3 every several epochs
-exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.3)
-
-model_ft_tuned = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
-                             num_epochs=25, device=device)
-
-# Step 5. Convert to quantized model
-
-from torch.quantization import convert
-model_ft_tuned.cpu()
-
-model_quantized_and_trained = convert(model_ft_tuned, inplace=False)
-
-
-
-######################################################################
-# Let's see how the quantized model performs on a few images
-#
-
-visualize_model(model_quantized_and_trained)
-
-plt.ioff()
-plt.tight_layout()
-plt.show()
-