From 8a5c0bea9f25e399b49c6c1a2b25c61f7fc62296 Mon Sep 17 00:00:00 2001
From: Jing Xu
Date: Mon, 4 Oct 2021 15:42:02 +0900
Subject: [PATCH 1/6] init for recipe for intel extension for pytorch

---
 .../recipes/intel_extension_for_pytorch.py | 309 ++++++++++++++++++
 recipes_source/recipes_index.rst           |   9 +
 2 files changed, 318 insertions(+)
 create mode 100644 recipes_source/recipes/intel_extension_for_pytorch.py

diff --git a/recipes_source/recipes/intel_extension_for_pytorch.py b/recipes_source/recipes/intel_extension_for_pytorch.py
new file mode 100644
index 00000000000..e54df64310f
--- /dev/null
+++ b/recipes_source/recipes/intel_extension_for_pytorch.py
@@ -0,0 +1,309 @@
"""
Intel(R) Extension for PyTorch*
************************f
**Author**: `Jing Xu `_

Intel® Extension for PyTorch* extends PyTorch with optimizations for extra
performance boost on Intel hardware. Most of the optimizations will be
included in stock PyTorch releases eventually, and the intention of the
extension is to deliver up-to-date features and optimizations for PyTorch
on Intel hardware. Examples include AVX-512 Vector Neural Network
Instructions (AVX512 VNNI) and Intel® Advanced Matrix Extensions (Intel® AMX).

Intel® Extension for PyTorch* has been released as an open-source project
at `Github `_.

Features
--------
"""

###############################################################################
# * Ease-of-use Python API: Intel® Extension for PyTorch* provides simple
# frontend Python APIs and utilities for users to get performance optimizations
# such as graph optimization and operator optimization with minor code changes.
# Typically, only 2 to 3 clauses are required to be added to the original code.

# * Channels Last: Comparing to the default NCHW memory format, channels_last
# (NHWC) memory format could further accelerate convolutional neural networks.
# In Intel® Extension for PyTorch*, NHWC memory format has been enabled for
# most key CPU operators, though not all of them have been merged to PyTorch
# master branch yet. They are expected to be fully landed in PyTorch upstream
# soon.

# * Auto Mixed Precision (AMP): Low precision data type BFloat16 has been
# natively supported on the 3rd Generation Xeon scalable Servers (aka Cooper
# Lake) with AVX512 instruction set and will be supported on the next
# generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix
# Extensions (Intel® AMX) instruction set with further boosted performance. The
# support of Auto Mixed Precision (AMP) with BFloat16 for CPU and BFloat16
# optimization of operators have been massively enabled in Intel® Extension
# for PyTorch*, and partially upstreamed to PyTorch master branch. Most of
# these optimizations will be landed in PyTorch master through PRs that are
# being submitted and reviewed. Graph Optimization: To optimize performance
# further with torchscript, Intel® Extension for PyTorch* supports fusion of
# frequently used operator patterns, like Conv2D+ReLU, Linear+ReLU, etc. The
# benefit of the fusions are delivered to users in a transparant fashion.
# Detailed fusion patterns supported can be found `here `_.

# * Operator Optimization: Intel® Extension for PyTorch* also optimizes
# operators and implements several customized operators for performance. A few
# ATen operators are replaced by their optimized counterparts in Intel®
# Extension for PyTorch* via ATen registration mechanism. Moreover, some
# customized operators are implemented for several popular topologies. For
# instance, ROIAlign and NMS are defined in Mask R-CNN. To improve performance
# of these topologies, Intel® Extension for PyTorch* also optimized these
# customized operators.

###############################################################################
# Getting Started
# ---------------

# Minor code changes are required for users to get started with Intel® Extension
# for PyTorch*. Both PyTorch imperative mode and TorchScript mode are
# supported. This section introduces usage of Intel® Extension for PyTorch* API
# functions for both imperative mode and TorchScript mode, covering data type
# Float32 and BFloat16. C++ usage will also be introduced at the end.

# You just need to import Intel® Extension for PyTorch* package and apply its
# optimize function against the model object. If it is a training workload, the
# optimize function also needs to be applied against the optimizer object.

# For training and inference with BFloat16 data type, torch.cpu.amp has been
# enabled in PyTorch upstream to support mixed precision with convenience, and
# BFloat16 datatype has been enabled extensively for CPU operators in PyTorch
# upstream and Intel® Extension for PyTorch*. Running torch.cpu.amp will match
# each operator to its appropriate datatype and return the best possible
# performance.

# The code changes that are required for Intel® Extension for PyTorch* are
# highlighted with comments on the line above each change.

###############################################################################
# Training
# ~~~~~~~~

###############################################################################
# Float32
# *******

import torch
import torch.nn as nn
# Import intel_extension_for_pytorch
import intel_extension_for_pytorch as ipex

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = nn.Linear(4, 5)

    def forward(self, input):
        return self.linear(input)

model = Model()
model.load_state_dict(torch.load(PATH))
optimizer.load_state_dict(torch.load(PATH))

# Invoke optimize function against the model object and optimizer object
model, optimizer = ipex.optimize(model, optimizer, dtype=torch.float32)

for images, label in train_loader:
    # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional.
    images = images.to(memory_format=torch.channels_last)

    loss = criterion(model(images), label)
    loss.backward()
    optimizer.step()
torch.save(model.state_dict(), PATH)
torch.save(optimizer.state_dict(), PATH)

###############################################################################
# BFloat16
# ********

import torch
import torch.nn as nn
# Import intel_extension_for_pytorch
import intel_extension_for_pytorch as ipex

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = nn.Linear(4, 5)

    def forward(self, input):
        return self.linear(input)

model = Model()
model.load_state_dict(torch.load(PATH))
optimizer.load_state_dict(torch.load(PATH))

# Invoke optimize function against the model object and optimizer object with data type set to torch.bfloat16
model, optimizer = ipex.optimize(model, optimizer, dtype=torch.bfloat16)

for images, label in train_loader:
    with torch.cpu.amp.autocast():
        # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional.
+ images = images.to(memory_format=torch.channels_last) + loss = criterion(model(images), label) + loss.backward() + optimizer.step() +torch.save(model.state_dict(), PATH) +torch.save(optimizer.state_dict(), PATH) + +############################################################################### +# Inference - Imperative Mode +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +############################################################################### +# Float32 +# ******* + +import torch +import torch.nn as nn + +# Import intel_extension_for_pytorch +import intel_extension_for_pytorch as ipex + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = nn.Linear(4, 5) + + def forward(self, input): + return self.linear(input) + +input = torch.randn(2, 4) +model = Model() +# Invoke optimize function against the model object +model = ipex.optimize(model) +res = model(input) +2.2.2. BFloat16 +import torch +import torch.nn as nn + +# Import intel_extension_for_pytorch +import intel_extension_for_pytorch as ipex + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = nn.Linear(4, 5) + + def forward(self, input): + return self.linear(input) + +input = torch.randn(2, 4) +model = Model() +# Invoke optimize function against the model object with data type set to torch.bfloat16 +model = ipex.optimize(model, dtype=torch.bfloat16) +with torch.cpu.amp.autocast(): + res = model(input) + +############################################################################### +# Inference - TorchScript Mode +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# TorchScript mode makes graph optimization possible, hence improves +# performance for some topologies. Intel® Extension for PyTorch* enables most +# commonly used operator pattern fusion, and users can get the performance +# benefit without additional code changes. 
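
###############################################################################
# One way to verify that the fusions took effect is to print the optimized
# graph of a traced and frozen model after a couple of warm-up runs, as
# sketched below for a generic ``model`` and sample ``input``. This aside is
# an illustrative addition to the recipe; the availability of ``graph_for``
# and the exact graph printed depend on the PyTorch and Intel® Extension for
# PyTorch* versions in use.
#
#   traced_model = torch.jit.freeze(torch.jit.trace(model, input))
#   traced_model(input)  # warm-up runs let the JIT apply its fusion passes
#   traced_model(input)
#   print(traced_model.graph_for(input))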
+ +############################################################################### +# Float32 +# ******* + +import torch +import torch.nn as nn + +# Import intel_extension_for_pytorch +import intel_extension_for_pytorch as ipex + +# oneDNN graph fusion is enabled by default, uncomment the line below to disable it explicitly +# ipex.enable_onednn_fusion(False) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = nn.Linear(4, 5) + + def forward(self, input): + return self.linear(input) + +input = torch.randn(2, 4) +model = Model() +# Invoke optimize function against the model object +model = ipex.optimize(model) +model = torch.jit.trace(model, torch.rand(args.batch_size, 3, 224, 224)) +model = torch.jit.freeze(model) +res = model(input) + +############################################################################### +# BFloat16 +# ******** + +import torch +import torch.nn as nn + +# Import intel_extension_for_pytorch +import intel_extension_for_pytorch as ipex + +# oneDNN graph fusion is enabled by default, uncomment the line below to disable it explicitly +# ipex.enable_onednn_fusion(False) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = nn.Linear(4, 5) + + def forward(self, input): + return self.linear(input) + +input = torch.randn(2, 4) +model = Model() +# Invoke optimize function against the model with data type set to torch.bfloat16 +model = ipex.optimize(model, dtype=torch.bfloat16) +with torch.cpu.amp.autocast(): + model = torch.jit.trace(model, torch.rand(args.batch_size, 3, 224, 224)) + model = torch.jit.freeze(model) + res = model(input) + +############################################################################### +# C++ +# --- + +# To work with libtorch, C++ library of PyTorch, Intel® Extension for PyTorch* +# provides its C++ dynamic library as well. The C++ library is supposed to handle +# inference workload only, such as service deployment. For regular development, +# please use Python interface. Comparing to usage of libtorch, no specific code +# changes are required, except for converting input data into channels last data +# format. During compilation, Intel optimizations will be activated automatically +# once C++ dynamic library of Intel® Extension for PyTorch* is linked. +# +# #include +# #include +# #include +# +# int main(int argc, const char* argv[]) { +# torch::jit::script::Module module; +# try { +# module = torch::jit::load(argv[1]); +# } +# catch (const c10::Error& e) { +# std::cerr << "error loading the model\n"; +# return -1; +# } +# std::vector inputs; +# // make sure input data are converted to channels last format +# inputs.push_back(torch::ones({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast)); +# +# at::Tensor output = module.forward(inputs).toTensor(); +# +# return 0; +# } + +############################################################################### +# Tutorials +# ~~~~~~~~~ + +# Numerous tutorials will be hosted on Intel® Extension for PyTorch* Github +# repo. Please visit the Github repo for detailed info. diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index ec69fff9bf1..3f79d7f3f6e 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -239,6 +239,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/tuning_guide.html :tags: Model-Optimization +.. Intel(R) Extension for PyTorch* + +.. 
customcarditem:: + :header: Intel(R) Extension for PyTorch* + :card_description: Introduction of Intel(R) Extension for PyTorch* + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/recipes/intel_extension_for_pytorch.html + :tags: Model-Optimization + .. Distributed Training .. customcarditem:: From f59fa6033fd1a337aaf15d35a301b4ba15b81e3a Mon Sep 17 00:00:00 2001 From: michaelhsu Date: Tue, 5 Oct 2021 09:24:19 +0900 Subject: [PATCH 2/6] update intel_extension_for_pytorch.py --- .../recipes/intel_extension_for_pytorch.py | 176 +++++++++--------- recipes_source/recipes_index.rst | 5 +- 2 files changed, 92 insertions(+), 89 deletions(-) diff --git a/recipes_source/recipes/intel_extension_for_pytorch.py b/recipes_source/recipes/intel_extension_for_pytorch.py index e54df64310f..2a00d65ae2e 100644 --- a/recipes_source/recipes/intel_extension_for_pytorch.py +++ b/recipes_source/recipes/intel_extension_for_pytorch.py @@ -1,9 +1,9 @@ """ -Intel(R) Extension for PyTorch* -************************f +Intel® Extension for PyTorch* +******************************* **Author**: `Jing Xu `_ -Intel® Extension for PyTorch* extends PyTorch with optimizations for extra +Intel Extension for PyTorch* extends PyTorch with optimizations for extra performance boost on Intel hardware. Most of the optimizations will be included in stock PyTorch releases eventually, and the intention of the extension is to deliver up to date features and optimizations for PyTorch @@ -15,59 +15,58 @@ Features -------- -""" -############################################################################### -# * Ease-of-use Python API: Intel® Extension for PyTorch* provides simple -# frontend Python APIs and utilities for users to get performance optimizations -# such as graph optimization and operator optimization with minor code changes. -# Typically, only 2 to 3 clauses are required to be added to the original code. - -# * Channels Last: Comparing to the default NCHW memory format, channels_last -# (NHWC) memory format could further accelerate convolutional neural networks. -# In Intel® Extension for PyTorch*, NHWC memory format has been enabled for -# most key CPU operators, though not all of them have been merged to PyTorch -# master branch yet. They are expected to be fully landed in PyTorch upstream -# soon. - -# * Auto Mixed Precision (AMP): Low precision data type BFloat16 has been -# natively supported on the 3rd Generation Xeon scalable Servers (aka Cooper -# Lake) with AVX512 instruction set and will be supported on the next -# generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix -# Extensions (Intel® AMX) instruction set with further boosted performance. The -# support of Auto Mixed Precision (AMP) with BFloat16 for CPU and BFloat16 -# optimization of operators have been massively enabled in Intel® Extension -# for PyTorch*, and partially upstreamed to PyTorch master branch. Most of -# these optimizations will be landed in PyTorch master through PRs that are -# being submitted and reviewed. Graph Optimization: To optimize performance -# further with torchscript, Intel® Extension for PyTorch* supports fusion of -# frequently used operator patterns, like Conv2D+ReLU, Linear+ReLU, etc. The -# benefit of the fusions are delivered to users in a transparant fashion. -# Detailed fusion patterns supported can be found `here `_. - -# * Operator Optimization: Intel® Extension for PyTorch* also optimizes -# operators and implements several customized operators for performance. 
A few -# ATen operators are replaced by their optimized counterparts in Intel® -# Extension for PyTorch* via ATen registration mechanism. Moreover, some -# customized operators are implemented for several popular topologies. For -# instance, ROIAlign and NMS are defined in Mask R-CNN. To improve performance -# of these topologies, Intel® Extension for PyTorch* also optimized these -# customized operators. +* Ease-of-use Python API: Intel® Extension for PyTorch* provides simple + frontend Python APIs and utilities for users to get performance optimizations + such as graph optimization and operator optimization with minor code changes. + Typically, only 2 to 3 clauses are required to be added to the original code. +* Channels Last: Comparing to the default NCHW memory format, channels_last + (NHWC) memory format could further accelerate convolutional neural networks. + In Intel® Extension for PyTorch*, NHWC memory format has been enabled for + most key CPU operators, though not all of them have been merged to PyTorch + master branch yet. They are expected to be fully landed in PyTorch upstream + soon. +* Auto Mixed Precision (AMP): Low precision data type BFloat16 has been + natively supported on the 3rd Generation Xeon scalable Servers (aka Cooper + Lake) with AVX512 instruction set and will be supported on the next + generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix + Extensions (Intel® AMX) instruction set with further boosted performance. The + support of Auto Mixed Precision (AMP) with BFloat16 for CPU and BFloat16 + optimization of operators have been massively enabled in Intel® Extension + for PyTorch*, and partially upstreamed to PyTorch master branch. Most of + these optimizations will be landed in PyTorch master through PRs that are + being submitted and reviewed. Graph Optimization: To optimize performance + further with torchscript, Intel® Extension for PyTorch* supports fusion of + frequently used operator patterns, like Conv2D+ReLU, Linear+ReLU, etc. The + benefit of the fusions are delivered to users in a transparant fashion. + Detailed fusion patterns supported can be found `here `_. +* Operator Optimization: Intel® Extension for PyTorch* also optimizes + operators and implements several customized operators for performance. A few + ATen operators are replaced by their optimized counterparts in Intel® + Extension for PyTorch* via ATen registration mechanism. Moreover, some + customized operators are implemented for several popular topologies. For + instance, ROIAlign and NMS are defined in Mask R-CNN. To improve performance + of these topologies, Intel® Extension for PyTorch* also optimized these + customized operators. +""" ############################################################################### # Getting Started # --------------- +############################################################################### # Minor code changes are required for users to get start with Intel® Extension # for PyTorch*. Both PyTorch imperative mode and TorchScript mode are # supported. This section introduces usage of Intel® Extension for PyTorch* API # functions for both imperative mode and TorchScript mode, covering data type # Float32 and BFloat16. C++ usage will also be introduced at the end. +############################################################################### # You just need to import Intel® Extension for PyTorch* package and apply its # optimize function against the model object. 
If it is a training workload, the # optimize function also needs to be applied against the optimizer object. +############################################################################### # For training and inference with BFloat16 data type, torch.cpu.amp has been # enabled in PyTorch upstream to support mixed precision with convenience, and # BFloat16 datatype has been enabled excessively for CPU operators in PyTorch @@ -75,6 +74,7 @@ # each operator to its appropriate datatype and returns the best possible # performance. +############################################################################### # The code changes that are required for Intel® Extension for PyTorch* are # highlighted with comments in a line above. @@ -84,7 +84,7 @@ ############################################################################### # Float32 -# ******* +# ^^^^^^^ import torch import torch.nn as nn @@ -102,15 +102,13 @@ def forward(self, input): model = Model() model.set_state_dict(torch.load(PATH)) optimizer.set_state_dict(torch.load(PATH)) - # Invoke optimize function against the model object and optimizer object model, optimizer = ipex.optimize(model, optimizer, dtype=torch.float32) -for images, label in train_loader(): +for images, label in train_loader(): # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional. - images = images.to(memory_format=torch.channels_last) - - loss = criterion(model(images), label) + images = images.to(memory_format=torch.channels_last) + loss = criterion(model(images), label) loss.backward() optimizer.step() torch.save(model.state_dict(), PATH) @@ -118,7 +116,7 @@ def forward(self, input): ############################################################################### # BFloat16 -# ******** +# ^^^^^^^^ import torch import torch.nn as nn @@ -136,15 +134,14 @@ def forward(self, input): model = Model() model.set_state_dict(torch.load(PATH)) optimizer.set_state_dict(torch.load(PATH)) - # Invoke optimize function against the model object and optimizer object with data type set to torch.bfloat16 model, optimizer = ipex.optimize(model, optimizer, dtype=torch.bfloat16) -for images, label in train_loader(): - with torch.cpu.amp.autocast(): +for images, label in train_loader(): + with torch.cpu.amp.autocast(): # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional. - images = images.to(memory_format=torch.channels_last) - loss = criterion(model(images), label) + images = images.to(memory_format=torch.channels_last) + loss = criterion(model(images), label) loss.backward() optimizer.step() torch.save(model.state_dict(), PATH) @@ -156,11 +153,10 @@ def forward(self, input): ############################################################################### # Float32 -# ******* +# """"""" import torch import torch.nn as nn - # Import intel_extension_for_pytorch import intel_extension_for_pytorch as ipex @@ -177,10 +173,13 @@ def forward(self, input): # Invoke optimize function against the model object model = ipex.optimize(model) res = model(input) -2.2.2. 
BFloat16 + +############################################################################### +# BFloat16 +# ^^^^^^^^ + import torch import torch.nn as nn - # Import intel_extension_for_pytorch import intel_extension_for_pytorch as ipex @@ -203,6 +202,7 @@ def forward(self, input): # Inference - TorchScript Mode # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +############################################################################### # TorchScript mode makes graph optimization possible, hence improves # performance for some topologies. Intel® Extension for PyTorch* enables most # commonly used operator pattern fusion, and users can get the performance @@ -210,11 +210,10 @@ def forward(self, input): ############################################################################### # Float32 -# ******* +# """"""" import torch import torch.nn as nn - # Import intel_extension_for_pytorch import intel_extension_for_pytorch as ipex @@ -239,11 +238,10 @@ def forward(self, input): ############################################################################### # BFloat16 -# ******** +# ^^^^^^^^ import torch import torch.nn as nn - # Import intel_extension_for_pytorch import intel_extension_for_pytorch as ipex @@ -262,15 +260,16 @@ def forward(self, input): model = Model() # Invoke optimize function against the model with data type set to torch.bfloat16 model = ipex.optimize(model, dtype=torch.bfloat16) -with torch.cpu.amp.autocast(): +with torch.cpu.amp.autocast(): model = torch.jit.trace(model, torch.rand(args.batch_size, 3, 224, 224)) model = torch.jit.freeze(model) - res = model(input) + res = model(input) ############################################################################### # C++ -# --- +# ~~~ +############################################################################### # To work with libtorch, C++ library of PyTorch, Intel® Extension for PyTorch* # provides its C++ dynamic library as well. The C++ library is supposed to handle # inference workload only, such as service deployment. For regular development, @@ -278,32 +277,35 @@ def forward(self, input): # changes are required, except for converting input data into channels last data # format. During compilation, Intel optimizations will be activated automatically # once C++ dynamic library of Intel® Extension for PyTorch* is linked. 
-# -# #include -# #include -# #include -# -# int main(int argc, const char* argv[]) { -# torch::jit::script::Module module; -# try { -# module = torch::jit::load(argv[1]); -# } -# catch (const c10::Error& e) { -# std::cerr << "error loading the model\n"; -# return -1; -# } -# std::vector inputs; -# // make sure input data are converted to channels last format -# inputs.push_back(torch::ones({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast)); -# -# at::Tensor output = module.forward(inputs).toTensor(); -# -# return 0; -# } + +''' +#include +#include +#include + +int main(int argc, const char* argv[]) { + torch::jit::script::Module module; + try { + module = torch::jit::load(argv[1]); + } + catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + return -1; + } +std::vector inputs; +// make sure input data are converted to channels last format +inputs.push_back(torch::ones({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast)); + +at::Tensor output = module.forward(inputs).toTensor(); + + return 0; +} +''' ############################################################################### # Tutorials -# ~~~~~~~~~ +# --------- +############################################################################### # Numerous tutorials will be hosted on Intel® Extension for PyTorch* Github -# repo. Please visit the Github repo for detailed info. +# repo. Please visit the `Github repo `_ for detailed info. diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 3f79d7f3f6e..d7799606bff 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -242,8 +242,8 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu .. Intel(R) Extension for PyTorch* .. customcarditem:: - :header: Intel(R) Extension for PyTorch* - :card_description: Introduction of Intel(R) Extension for PyTorch* + :header: Intel® Extension for PyTorch* + :card_description: Introduction of Intel® Extension for PyTorch* :image: ../_static/img/thumbnails/cropped/profiler.png :link: ../recipes/recipes/intel_extension_for_pytorch.html :tags: Model-Optimization @@ -304,6 +304,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/dynamic_quantization /recipes/recipes/amp_recipe /recipes/recipes/tuning_guide + /recipes/recipes/intel_extension_for_pytorch /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling From 3272f2b0d80cb055b6e8c1b2a194ecf1ac03ac22 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 5 Oct 2021 16:22:18 +0900 Subject: [PATCH 3/6] update intel_extension_for_pytorch.py for c++ part --- .../recipes/intel_extension_for_pytorch.py | 83 +++++++++++++------ 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/recipes_source/recipes/intel_extension_for_pytorch.py b/recipes_source/recipes/intel_extension_for_pytorch.py index 2a00d65ae2e..1cfb5dee894 100644 --- a/recipes_source/recipes/intel_extension_for_pytorch.py +++ b/recipes_source/recipes/intel_extension_for_pytorch.py @@ -22,24 +22,27 @@ Typically, only 2 to 3 clauses are required to be added to the original code. * Channels Last: Comparing to the default NCHW memory format, channels_last (NHWC) memory format could further accelerate convolutional neural networks. 
- In Intel® Extension for PyTorch*, NHWC memory format has been enabled for + In Intel® Extension for PyTorch*, NHWC memory format has been enabled for most key CPU operators, though not all of them have been merged to PyTorch master branch yet. They are expected to be fully landed in PyTorch upstream soon. * Auto Mixed Precision (AMP): Low precision data type BFloat16 has been natively supported on the 3rd Generation Xeon scalable Servers (aka Cooper - Lake) with AVX512 instruction set and will be supported on the next + Lake) with AVX512 instruction set and will be supported on the next generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix Extensions (Intel® AMX) instruction set with further boosted performance. The support of Auto Mixed Precision (AMP) with BFloat16 for CPU and BFloat16 - optimization of operators have been massively enabled in Intel® Extension + optimization of operators have been massively enabled in Intel® Extension for PyTorch*, and partially upstreamed to PyTorch master branch. Most of these optimizations will be landed in PyTorch master through PRs that are - being submitted and reviewed. Graph Optimization: To optimize performance - further with torchscript, Intel® Extension for PyTorch* supports fusion of - frequently used operator patterns, like Conv2D+ReLU, Linear+ReLU, etc. The - benefit of the fusions are delivered to users in a transparant fashion. - Detailed fusion patterns supported can be found `here `_. + being submitted and reviewed. +* Graph Optimization: To optimize performance further with torchscript, + Intel® Extension for PyTorch* supports fusion of frequently used operator + patterns, like Conv2D+ReLU, Linear+ReLU, etc. The benefit of the fusions are + delivered to users in a transparant fashion. Detailed fusion patterns + supported can be found `here `_. + The graph optimization will be up-streamed to PyTorch with the introduction + of oneDNN Graph API. * Operator Optimization: Intel® Extension for PyTorch* also optimizes operators and implements several customized operators for performance. A few ATen operators are replaced by their optimized counterparts in Intel® @@ -275,37 +278,65 @@ def forward(self, input): # inference workload only, such as service deployment. For regular development, # please use Python interface. Comparing to usage of libtorch, no specific code # changes are required, except for converting input data into channels last data -# format. During compilation, Intel optimizations will be activated automatically +# format. Compilation follows the recommended methodology with CMake. Detailed +# instructions can be found in `PyTorch tutorial `_. +# During compilation, Intel optimizations will be activated automatically # once C++ dynamic library of Intel® Extension for PyTorch* is linked. 
+###############################################################################
+# **example-app.cpp**
+
 '''
 #include <torch/script.h>
 #include <iostream>
 #include <memory>
 
 int main(int argc, const char* argv[]) {
-    torch::jit::script::Module module;
-    try {
-        module = torch::jit::load(argv[1]);
-    }
-    catch (const c10::Error& e) {
-        std::cerr << "error loading the model\n";
-        return -1;
-    }
-std::vector<torch::jit::IValue> inputs;
-// make sure input data are converted to channels last format
-inputs.push_back(torch::ones({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast));
-
-at::Tensor output = module.forward(inputs).toTensor();
-
-    return 0;
+  torch::jit::script::Module module;
+  try {
+    module = torch::jit::load(argv[1]);
+  }
+  catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n";
+    return -1;
+  }
+  std::vector<torch::jit::IValue> inputs;
+  // make sure input data are converted to channels last format
+  inputs.push_back(torch::ones({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast));
+
+  at::Tensor output = module.forward(inputs).toTensor();
+
+  return 0;
 }
 '''

+###############################################################################
+# **CMakeLists.txt**
+
+'''
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(example-app)
+
+find_package(Torch REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -Wl,--no-as-needed")
+
+add_executable(example-app example-app.cpp)
+# Link the binary aganist the C++ dynamic library file of Intel® Extension for PyTorch*
+target_link_libraries(example-app "${TORCH_LIBRARIES}" "${INTEL_EXTENSION_FOR_PYTORCH_PATH}/lib/lib.so")
+
+set_property(TARGET example-app PROPERTY CXX_STANDARD 14)
+'''
+
+###############################################################################
+# **Command for compilation**
+
+'''
+cmake -DCMAKE_PREFIX_PATH= -DINTEL_EXTENSION_FOR_PYTORCH_PATH= ..
+'''
+
 ###############################################################################
 # Tutorials
 # ---------

 ###############################################################################
-# Numerous tutorials will be hosted on Intel® Extension for PyTorch* Github
-# repo. Please visit the `Github repo `_ for detailed info.
+# Please visit `Intel® Extension for PyTorch* Github repo `_ for more tutorials.
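
The example-app.cpp above loads a serialized TorchScript module from its first
command-line argument, but the recipe does not show the exporting side. A
minimal sketch of producing such a file, reusing the Model class and
ipex.optimize flow from the recipe, is given below; the file name model.pt is
illustrative, and note that the C++ snippet feeds a {1, 3, 224, 224} tensor,
so a real export must use a model that accepts that input shape:

    import torch
    import intel_extension_for_pytorch as ipex

    model = Model().eval()
    model = ipex.optimize(model)
    # Trace and freeze with a sample input, then save for torch::jit::load
    traced = torch.jit.freeze(torch.jit.trace(model, torch.randn(2, 4)))
    traced.save("model.pt")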
From 5784a8da7d291b6f2619e61c975594dc78fa2aa0 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 5 Oct 2021 17:55:12 +0900 Subject: [PATCH 4/6] update c++ so file name --- recipes_source/recipes/intel_extension_for_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/intel_extension_for_pytorch.py b/recipes_source/recipes/intel_extension_for_pytorch.py index 1cfb5dee894..5c2046992bc 100644 --- a/recipes_source/recipes/intel_extension_for_pytorch.py +++ b/recipes_source/recipes/intel_extension_for_pytorch.py @@ -322,7 +322,7 @@ def forward(self, input): add_executable(example-app example-app.cpp) # Link the binary aganist the C++ dynamic library file of Intel® Extension for PyTorch* -target_link_libraries(example-app "${TORCH_LIBRARIES}" "${INTEL_EXTENSION_FOR_PYTORCH_PATH}/lib/lib.so") +target_link_libraries(example-app "${TORCH_LIBRARIES}" "${INTEL_EXTENSION_FOR_PYTORCH_PATH}/lib/libintel-ext-pt-cpu.so") set_property(TARGET example-app PROPERTY CXX_STANDARD 14) ''' From 1101ae5ded6fd60197e24c00599255fa91ec41a5 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 6 Oct 2021 10:33:18 -0700 Subject: [PATCH 5/6] Fix typos --- recipes_source/recipes/intel_extension_for_pytorch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_source/recipes/intel_extension_for_pytorch.py b/recipes_source/recipes/intel_extension_for_pytorch.py index 5c2046992bc..d7510651be0 100644 --- a/recipes_source/recipes/intel_extension_for_pytorch.py +++ b/recipes_source/recipes/intel_extension_for_pytorch.py @@ -39,7 +39,7 @@ * Graph Optimization: To optimize performance further with torchscript, Intel® Extension for PyTorch* supports fusion of frequently used operator patterns, like Conv2D+ReLU, Linear+ReLU, etc. The benefit of the fusions are - delivered to users in a transparant fashion. Detailed fusion patterns + delivered to users in a transparent fashion. Detailed fusion patterns supported can be found `here `_. The graph optimization will be up-streamed to PyTorch with the introduction of oneDNN Graph API. 
@@ -321,7 +321,7 @@ def forward(self, input):
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -Wl,--no-as-needed")
 
 add_executable(example-app example-app.cpp)
-# Link the binary aganist the C++ dynamic library file of Intel® Extension for PyTorch*
+# Link the binary against the C++ dynamic library file of Intel® Extension for PyTorch*
 target_link_libraries(example-app "${TORCH_LIBRARIES}" "${INTEL_EXTENSION_FOR_PYTORCH_PATH}/lib/libintel-ext-pt-cpu.so")
 
 set_property(TARGET example-app PROPERTY CXX_STANDARD 14)

From 7b0dc3812ed9393cef5d1631929f9ad655109320 Mon Sep 17 00:00:00 2001
From: Jing Xu
Date: Thu, 7 Oct 2021 15:14:49 +0900
Subject: [PATCH 6/6] fixed issue for inference sample codes

---
 .../recipes/intel_extension_for_pytorch.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/recipes_source/recipes/intel_extension_for_pytorch.py b/recipes_source/recipes/intel_extension_for_pytorch.py
index d7510651be0..31d01079f37 100644
--- a/recipes_source/recipes/intel_extension_for_pytorch.py
+++ b/recipes_source/recipes/intel_extension_for_pytorch.py
@@ -173,8 +173,9 @@ def forward(self, input):
 
 input = torch.randn(2, 4)
 model = Model()
+model.eval()
 # Invoke optimize function against the model object
-model = ipex.optimize(model)
+model = ipex.optimize(model, dtype=torch.float32)
 res = model(input)
 
@@ -196,6 +197,7 @@ def forward(self, input):
 
 input = torch.randn(2, 4)
 model = Model()
+model.eval()
 # Invoke optimize function against the model object with data type set to torch.bfloat16
 model = ipex.optimize(model, dtype=torch.bfloat16)
 with torch.cpu.amp.autocast():
@@ -233,9 +235,10 @@ def forward(self, input):
 
 input = torch.randn(2, 4)
 model = Model()
+model.eval()
 # Invoke optimize function against the model object
-model = ipex.optimize(model)
-model = torch.jit.trace(model, torch.rand(args.batch_size, 3, 224, 224))
+model = ipex.optimize(model, dtype=torch.float32)
+model = torch.jit.trace(model, torch.randn(2, 4))
 model = torch.jit.freeze(model)
 res = model(input)
 
@@ -261,10 +264,11 @@ def forward(self, input):
 
 input = torch.randn(2, 4)
 model = Model()
+model.eval()
 # Invoke optimize function against the model with data type set to torch.bfloat16
 model = ipex.optimize(model, dtype=torch.bfloat16)
 with torch.cpu.amp.autocast():
-    model = torch.jit.trace(model, torch.rand(args.batch_size, 3, 224, 224))
+    model = torch.jit.trace(model, torch.randn(2, 4))
     model = torch.jit.freeze(model)
     res = model(input)
 
@@ -327,6 +331,12 @@ def forward(self, input):
 set_property(TARGET example-app PROPERTY CXX_STANDARD 14)
 '''
 
+###############################################################################
+# **Note:** Since Intel® Extension for PyTorch* is still under development, the
+# name of the C++ dynamic library in the master branch may differ from
+# *libintel-ext-pt-cpu.so* shown above. Please check the actual name in the
+# installation folder; the shared object file name starts with *libintel-*.
+
 ###############################################################################
 # **Command for compilation**
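
As for the note above about the library name: assuming the C++ runtime
libraries ship in a lib/ folder inside the installed
intel_extension_for_pytorch Python package (the exact layout may vary across
versions), a quick way to list the candidate *libintel-* files is:

    import glob
    import os
    import intel_extension_for_pytorch as ipex

    # List shared objects whose names start with "libintel-"
    lib_dir = os.path.join(os.path.dirname(ipex.__file__), "lib")
    print(glob.glob(os.path.join(lib_dir, "libintel-*")))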