
Commit fa73b96

Merge branch 'master' into LRfuse
2 parents: 3e499cb + ce71457

30 files changed: +826 −189 lines changed

.gitignore

Lines changed: 3 additions & 2 deletions
@@ -11,6 +11,7 @@
 .coverage
 .hypothesis
 .mypy_cache
+*.so*
 */*.pyc
 */*.so*
 */**/__pycache__
@@ -91,9 +92,9 @@ torch/version.py
 intel_pytorch_extension_py/version.py
 torch_ipex/csrc/version.cpp
 torch_ipex/csrc/aten_ipex_sparse_type_default.*
-torch_ipex/csrc/cpu/SparseOPs.*
+torch_ipex/csrc/cpu/SparseOPs*
 torch_ipex/csrc/cpu/OPs.*
-torch_ipex/csrc/cpu/DenseOPs.*
+torch_ipex/csrc/cpu/DenseOPs*
 
 cscope.*

cmake/CPU.cmake

Lines changed: 4 additions & 3 deletions
@@ -17,7 +17,7 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
 FIND_PACKAGE(AVX)
 
-IF (NOT C_AVX512_FOUND)
+IF (NOT C_AVX512_FOUND AND NOT CXX_AVX512_FOUND)
   message(FATAL_ERROR "Please build IPEX on Machines that support AVX512.")
 ENDIF()
 
@@ -58,13 +58,14 @@ endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pedantic")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=redundant-decls")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=old-style-cast")
-IF (C_AVX512_FOUND)
+IF (C_AVX512_FOUND OR CXX_AVX512_FOUND)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAVX512")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512bw")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
 ENDIF()
-IF (C_AVX512_BF16_FOUND)
+IF (C_AVX512_BF16_FOUND OR CXX_AVX512_BF16_FOUND)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512bf16 -DAVX512_BF16")
 ENDIF()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
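The guard now passes if either the C or the C++ compiler probe reports AVX512 (FIND_PACKAGE(AVX) presumably sets both the C_ and CXX_ variables), and the new -DAVX512 define lets sources guard AVX512-only code paths. As a rough illustration of what the FATAL_ERROR is protecting against, the sketch below checks a Linux build machine for the same CPU features the -mavx512f/-mavx512bw/-mavx512vl flags target; machine_supports_avx512 is a hypothetical helper, not part of this commit.

```python
# Hypothetical pre-flight check, not part of this commit: mirrors the CMake
# FATAL_ERROR guard by testing for the AVX512 features IPEX compiles with.
def machine_supports_avx512():
    required = {'avx512f', 'avx512bw', 'avx512vl'}  # matches -mavx512f/bw/vl
    try:
        with open('/proc/cpuinfo') as f:
            for line in f:
                if line.startswith('flags'):
                    return required.issubset(line.split())
    except OSError:
        pass  # non-Linux or unreadable: report unsupported
    return False

if __name__ == '__main__':
    if not machine_supports_avx512():
        print('Please build IPEX on machines that support AVX512.')
```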

intel_pytorch_extension_py/__init__.py

Lines changed: 1 addition & 3 deletions
@@ -3,6 +3,4 @@
 from .version import __version__
 from .optim import *
 from .ops import *
-import _torch_ipex as core
-
-core._initialize_aten_bindings()
+import _torch_ipex as core

intel_pytorch_extension_py/ops/embeddingbag.py

Lines changed: 23 additions & 1 deletion
@@ -3,6 +3,8 @@
 from torch.autograd import Function
 import _torch_ipex as core
 
+'''
+# extension for BF16 fast path only
 torch_embedding_bag = torch.embedding_bag
 def embeddingbag(weights, inputs, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset):
     if weights.dtype == torch.float:
@@ -12,21 +14,41 @@ def embeddingbag(weights, inputs, offsets, scale_grad_by_freq, mode, sparse, per
         ret = (ret, None, None, None)
     else:
         assert(0, "unimplement embeddingbag path in extension")
-
+'''
+def embeddingbag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset):
+    ret = EmbeddingBagFunction.apply(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset)
     return ret
 
 
 class EmbeddingBagFunction(Function):
+    '''
     @staticmethod
    def forward(ctx, weights, inputs, offsets):
        ctx.save_for_backward(weights, inputs, offsets)
        output = core.embedding_bag_forward(weights, inputs, offsets)
        return output
+    '''
+    @staticmethod
+    def forward(ctx, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset):
+        ctx.scale_grad_by_freq = scale_grad_by_freq
+        ctx.mode = mode
+        ctx.sparse = sparse
+        ctx.num_weight = weight.size(0)
+        ctx.save_for_backward(indices, offsets, per_sample_weights)
+        ret = core.embedding_bag_forward(weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset)
+        return ret
 
+    '''
     @staticmethod
    def backward(ctx, grad_out):
        weights, inputs, offsets = ctx.saved_tensors
        grad_weight = core.embedding_bag_backward(grad_out, weights, inputs, offsets)
        return (grad_weight, None, None)
+    '''
+    @staticmethod
+    def backward(ctx, grad, offset2bag, bag_size, maximum_indices):
+        indices, offsets, per_sample_weights = ctx.saved_tensors
+        grad_weight = core.embedding_bag_backward(grad, indices, offsets, offset2bag, bag_size, maximum_indices, ctx.num_weight, ctx.scale_grad_by_freq, ctx.mode, ctx.sparse, per_sample_weights)
+        return grad_weight, None, None, None, None, None, None, None
 
 torch.embedding_bag = embeddingbag
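With the BF16-only fast path commented out, embeddingbag now mirrors ATen's full embedding_bag signature and routes every call through EmbeddingBagFunction. By analogy with ATen's _embedding_bag, core.embedding_bag_forward appears to return a 4-tuple (output, offset2bag, bag_size, maximum_indices), which is why backward takes four incoming gradients and returns one gradient per forward input: grad_weight plus seven Nones. A minimal usage sketch, assuming the extension builds and imports cleanly:

```python
# Minimal sketch, assuming a working IPEX build: nn.EmbeddingBag lowers to
# torch.embedding_bag, which this module has monkey-patched on import.
import torch
import torch.nn as nn
import intel_pytorch_extension_py  # installs torch.embedding_bag = embeddingbag

bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode='sum')
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4])   # two bags: indices[0:4] and indices[4:]

out = bag(indices, offsets)      # forward -> core.embedding_bag_forward
out.sum().backward()             # backward -> core.embedding_bag_backward
print(bag.weight.grad.shape)     # expected: torch.Size([10, 4])
```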

scripts/cpu/common/codegen.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+import os
+
+def write_or_skip(filepath, content):
+    try:
+        with open(filepath, 'r') as f:
+            old_content = f.read()
+    except IOError:
+        old_content = None
+
+    if old_content != content:
+        with open(filepath, 'w') as f:
+            print('writing', filepath)
+            f.write(content)
+    else:
+        print('skipped writing', filepath)
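write_or_skip is the small utility that makes the sharded codegen incremental: it rewrites a generated file only when its content actually changed, so an unchanged file keeps its mtime and an mtime-based build (make/ninja) will not recompile that translation unit. Illustrative behavior, with an example filename:

```python
# Illustrative run (the path is an example): the second call leaves the
# file untouched, so the build system skips recompiling it.
from common.codegen import write_or_skip

write_or_skip('OPs_0.cpp', '// generated\n')  # prints: writing OPs_0.cpp
write_or_skip('OPs_0.cpp', '// generated\n')  # prints: skipped writing OPs_0.cpp
```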

scripts/cpu/gen-dense-cpu-ops.py

Lines changed: 33 additions & 27 deletions
@@ -9,6 +9,7 @@
 import sys
 import json
 
+from common.codegen import write_or_skip
 from common.cpp_sig_parser import CPPSig
 from common.aten_sig_parser import AtenSig
 
@@ -92,6 +93,12 @@
     .op(torch::RegisterOperators::options().schema("{}")
       .impl_unboxedOnlyKernel<{}, &{}>(at::DispatchKey::DPCPPTensorId)
       .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA))"""
+
+_REG_BLOCK = """
+namespace {{
+  static auto dispatch = torch::RegisterOperators(){reg_ops};
+}}"""
+
 _H_HEADER = """// Autogenerated file by {gen}. Do not edit directly!
 #pragma once
 
@@ -105,8 +112,6 @@ class AtenIpexCPUDefault {{
 {hfuncs}
 }};
 
-void RegisterIpexDenseOPs();
-
 }} // namespace cpu
 
 }} // namespace torch_ipex
@@ -145,9 +150,7 @@ def __init__(self, reg_dec_file_path, func_file_path, op_h_file_path, op_cpp_fil
         self._reg_dec_file_path = reg_dec_file_path
         self._func_file_path = func_file_path
         self._op_h_file_path = op_h_file_path
-        self._op_h_file = None
         self._op_cpp_file_path = op_cpp_file_path
-        self._op_cpp_file = None
         self._sigs = []
         self._err_info = []
         self._func_data = ''
@@ -223,9 +226,6 @@ def prepare_functions(self):
         with open(self._func_file_path, 'r') as ff:
             self._func_data = ff.read()
 
-        self._op_h_file = open(self._op_h_file_path, 'w')
-        self._op_cpp_file = open(self._op_cpp_file_path, 'w')
-
         print('Extracted {} functions ({} errors) from {}'.format(
             len(self._sigs),
             len(self._err_info),
@@ -452,22 +452,37 @@ def gen_fallback_post_code(self, cpp_sig):
     def gen_head_dec_code(self, cpp_func_str_h):
         return ' static {};\n'.format(cpp_func_str_h)
 
+    def gen_cpu_ops_shard(self, func_defs, cpp_path, header_path, num_shards=1):
+        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join([f['dec'] for f in func_defs]))
+        write_or_skip(header_path, head_file_content)
+
+        shards = [[] for _ in range(num_shards)]
+        for idx, func in enumerate(func_defs):
+            shards[idx % num_shards].append(func)
+
+        for idx, shard in enumerate(shards):
+            regs_code = _REG_BLOCK.format(reg_ops=''.join([f['reg'] for f in shard]))
+            defs_code = ''.join([f['def'] for f in shard])
+
+            filename, ext = os.path.splitext(cpp_path)
+            shard_filepath = '%s_%s%s' % (filename, idx, ext)
+            shard_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=defs_code, regs=regs_code)
+            write_or_skip(shard_filepath, shard_content)
+
     def gen_code(self):
         self.prepare_functions()
         assert len(self._err_info) == 0
 
         def is_conv_overrideable_func(fname):
             return fname in ['convolution_overrideable', 'convolution_backward_overrideable']
 
-        func_decs = []
-        func_regs = []
         func_defs = []
-        for cpp_sig, aten_sig, cpp_func_sig_str, aten_func_sig_str in self._sigs:
+        for cpp_sig, _, cpp_func_sig_str, aten_func_sig_str in self._sigs:
             cpp_func_str_h, cpp_func_str_cpp = self.gen_func_signature(cpp_func_sig_str)
             # Gen declaration code for head file
-            func_decs.append(self.gen_head_dec_code(cpp_func_str_h))
+            func_dec = self.gen_head_dec_code(cpp_func_str_h)
 
-            func_regs.append(_REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sig), "AtenIpexCPUDefault::" + cpp_sig.def_name))
+            func_reg = _REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sig), "AtenIpexCPUDefault::" + cpp_sig.def_name)
 
             # Gen definition code for cpp file
             code = '{} {{\n'.format(cpp_func_str_cpp)
@@ -480,23 +495,14 @@ def is_conv_overrideable_func(fname):
             code += self.gen_fallback_code(cpp_sig)
             code += self.gen_fallback_post_code(cpp_sig)
 
-            code += '}\n'
-
-            code += '\n'
-
-            func_defs.append(code)
-
-        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join(func_decs))
-
-        regs_code = 'void RegisterIpexDenseOPs() {\n'
-        regs_code += ' static auto dispatch = torch::RegisterOperators()\n'
-        regs_code += ''.join(func_regs)
-        regs_code += ';\n}\n'
+            code += '}\n\n'
 
-        source_file_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=''.join(func_defs), regs=regs_code)
-        print(head_file_content, file=self._op_h_file)
-        print(source_file_content, file=self._op_cpp_file)
+            func_defs.append({'dec': func_dec, 'reg': func_reg, 'def': code})
 
+        self.gen_cpu_ops_shard(func_defs,
+                               cpp_path=self._op_cpp_file_path,
+                               header_path=self._op_h_file_path,
+                               num_shards=8)
 
 
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser()
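The core of this change is gen_cpu_ops_shard: instead of streaming everything into one huge autogenerated .cpp, it round-robins the collected {'dec', 'reg', 'def'} records across num_shards=8 files named <base>_0.cpp through <base>_7.cpp, which the build can compile in parallel. This is also why the .gitignore patterns above lost their trailing dot: DenseOPs* and SparseOPs* now have to match the numbered shard files. The bucketing in isolation (names here are illustrative):

```python
# Self-contained sketch of the round-robin bucketing used by
# gen_cpu_ops_shard; the function names are placeholders.
import os

def shard_paths(cpp_path, funcs, num_shards):
    shards = [[] for _ in range(num_shards)]
    for idx, func in enumerate(funcs):
        shards[idx % num_shards].append(func)  # function i -> shard i % N
    filename, ext = os.path.splitext(cpp_path)
    return {'%s_%s%s' % (filename, i, ext): s for i, s in enumerate(shards)}

print(shard_paths('DenseOPs.cpp', ['add', 'sub', 'mul'], 2))
# {'DenseOPs_0.cpp': ['add', 'mul'], 'DenseOPs_1.cpp': ['sub']}
```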

scripts/cpu/gen-sparse-cpu-ops.py

Lines changed: 43 additions & 33 deletions
@@ -9,6 +9,7 @@
 import sys
 import json
 
+from common.codegen import write_or_skip
 from common.cpp_sig_parser import CPPSig
 from common.aten_sig_parser import AtenSig
 
@@ -47,6 +48,13 @@
     .op(torch::RegisterOperators::options().schema("{}")
       .impl_unboxedOnlyKernel<{}, &{}>(at::DispatchKey::SparseDPCPPTensorId)
       .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA))"""
+
+_REG_BLOCK = """
+namespace {{
+  static auto dispatch = torch::RegisterOperators(){reg_ops};
+}}"""
+
+
 _H_HEADER = """// Autogenerated file by {gen}. Do not edit directly!
 #pragma once
 
@@ -60,8 +68,6 @@ class AtenIpexCPUSparse {{
 {hfuncs}
 }};
 
-void RegisterIpexSparseOPs();
-
 }} // namespace cpu
 
 }} // namespace torch_ipex
@@ -100,9 +106,7 @@ def __init__(self, reg_dec_file_path, func_file_path, sparse_dec_file_path, spar
         self._sparse_dec_file_path = sparse_dec_file_path
         self._sparse_attr_file_path = sparse_attr_file_path
         self._op_h_file_path = op_h_file_path
-        self._op_h_file = None
         self._op_cpp_file_path = op_cpp_file_path
-        self._op_cpp_file = None
         self._sigs = []
         self._sparse_attr_data = ''
         self._sparse_sigs = []
@@ -155,8 +159,8 @@ def prepare_functions(self):
                 continue
             cpp_func_sig_str = m.group(1)
             _sparse_sig_strs.append(cpp_func_sig_str)
-            print(cpp_func_sig_str)
-            print("********************")
+            # print(cpp_func_sig_str)
+            # print("********************")
 
         # Parse SparseAttrType.h
         with open(self._sparse_attr_file_path, 'r') as ff:
@@ -202,9 +206,6 @@ def prepare_functions(self):
                 self._err_info.append((cpp_func_sig, str(e)))
                 print('Error parsing "{}": {}'.format(cpp_func_sig, e), file=sys.stderr)
 
-        self._op_h_file = open(self._op_h_file_path, 'w')
-        self._op_cpp_file = open(self._op_cpp_file_path, 'w')
-
        print('Extracted {} functions ({} errors) from {}'.format(
             len(self._sigs),
             len(self._err_info),
@@ -369,44 +370,53 @@ def gen_fallback_post_code(self, cpp_sig):
     def gen_head_dec_code(self, cpp_func_str_h):
         return ' static {};\n'.format(cpp_func_str_h)
 
+    def gen_cpu_ops_shard(self, func_defs, cpp_path, header_path, num_shards=1):
+        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join([f['dec'] for f in func_defs]))
+        write_or_skip(header_path, head_file_content)
+
+        shards = [[] for _ in range(num_shards)]
+        for idx, func in enumerate(func_defs):
+            shards[idx % num_shards].append(func)
+
+        for idx, shard in enumerate(shards):
+            regs_code = _REG_BLOCK.format(reg_ops=''.join([f['reg'] for f in shard]))
+            defs_code = ''.join([f['def'] for f in shard])
+
+            filename, ext = os.path.splitext(cpp_path)
+            shard_filepath = '%s_%s%s' % (filename, idx, ext)
+            shard_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=defs_code, regs=regs_code)
+            write_or_skip(shard_filepath, shard_content)
+
     def gen_code(self):
         self.prepare_functions()
         assert len(self._err_info) == 0
 
-        func_decs = []
-        func_regs = []
         func_defs = []
         for cpp_sparse_sig, _, cpp_sparse_func_sig_str, aten_func_sig_str in self._sigs:
-            func_regs.append(_REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sparse_sig), "AtenIpexCPUSparse::" + cpp_sparse_sig.def_name))
             # Gen declaration code for head file
             cpp_func_str_h, cpp_func_str_cpp = self.gen_func_signature(cpp_sparse_func_sig_str)
-            func_decs.append(self.gen_head_dec_code(cpp_func_str_h))
+            func_dec = self.gen_head_dec_code(cpp_func_str_h)
 
-            # Since we have pre-defined attr OPs, we don't need to regenerate it
-            if self.is_sparse_attr_function(cpp_sparse_sig.def_name):
-                continue
+            func_reg = _REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sparse_sig), "AtenIpexCPUSparse::" + cpp_sparse_sig.def_name)
 
-            # Gen definition code for cpp file
-            code = '{} {{\n'.format(cpp_func_str_cpp)
-            code += self.gen_fallback_prepare_code(cpp_sparse_sig)
-            code += self.gen_fallback_code(cpp_sparse_sig)
-            code += self.gen_fallback_post_code(cpp_sparse_sig)
-
-            code += '}\n\n'
-
-            func_defs.append(code)
+            code = ''
+            # Since we have pre-defined attr OPs, we don't need to regenerate it
+            if not self.is_sparse_attr_function(cpp_sparse_sig.def_name):
 
-        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join(func_decs))
+                # Gen definition code for cpp file
+                code += '{} {{\n'.format(cpp_func_str_cpp)
+                code += self.gen_fallback_prepare_code(cpp_sparse_sig)
+                code += self.gen_fallback_code(cpp_sparse_sig)
+                code += self.gen_fallback_post_code(cpp_sparse_sig)
 
-        regs_code = 'void RegisterIpexSparseOPs() {\n'
-        regs_code += ' static auto dispatch = torch::RegisterOperators()\n'
-        regs_code += ''.join(func_regs)
-        regs_code += ';\n}\n'
+                code += '}\n\n'
 
-        source_file_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=''.join(func_defs), regs=regs_code)
-        print(head_file_content, file=self._op_h_file)
-        print(source_file_content, file=self._op_cpp_file)
+            func_defs.append({'dec': func_dec, 'reg': func_reg, 'def': code})
 
+        self.gen_cpu_ops_shard(func_defs,
+                               cpp_path=self._op_cpp_file_path,
+                               header_path=self._op_h_file_path,
+                               num_shards=1)
 
 
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser()
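The sparse generator gets the same restructuring but keeps num_shards=1; the dense generator carries the bulk of the operators, so it is the one split eight ways. Note also the registration change shared by both scripts: _REG_BLOCK emits a static torch::RegisterOperators object inside an anonymous namespace, so each shard registers its ops when the library is loaded, replacing the RegisterIpexDenseOPs()/RegisterIpexSparseOPs() entry points a caller had to invoke, and presumably why __init__.py above no longer calls core._initialize_aten_bindings(). A sketch of what one shard's block expands to (the .op(...) body is abbreviated from the real _REG_PATTERN output):

```python
# Expanding _REG_BLOCK for a single illustrative operator registration.
_REG_BLOCK = """
namespace {{
  static auto dispatch = torch::RegisterOperators(){reg_ops};
}}"""

reg_ops = '\n  .op(/* schema + kernel for one generated operator */)'
print(_REG_BLOCK.format(reg_ops=reg_ops))
# namespace {
#   static auto dispatch = torch::RegisterOperators()
#   .op(/* schema + kernel for one generated operator */);
# }
```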
