diff --git a/.gitignore b/.gitignore
index e4bd68a61..83b135eb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@
 .coverage
 .hypothesis
 .mypy_cache
+*.so*
 */*.pyc
 */*.so*
 */**/__pycache__
@@ -91,9 +92,9 @@
 torch/version.py
 intel_pytorch_extension_py/version.py
 torch_ipex/csrc/version.cpp
 torch_ipex/csrc/aten_ipex_sparse_type_default.*
-torch_ipex/csrc/cpu/SparseOPs.*
+torch_ipex/csrc/cpu/SparseOPs*
 torch_ipex/csrc/cpu/OPs.*
-torch_ipex/csrc/cpu/DenseOPs.*
+torch_ipex/csrc/cpu/DenseOPs*
 
 cscope.*
diff --git a/intel_pytorch_extension_py/__init__.py b/intel_pytorch_extension_py/__init__.py
index 82c769a12..0d9bb30d6 100644
--- a/intel_pytorch_extension_py/__init__.py
+++ b/intel_pytorch_extension_py/__init__.py
@@ -3,6 +3,4 @@
 from .version import __version__
 from .optim import *
 from .ops import *
-import _torch_ipex as core
-
-core._initialize_aten_bindings()
+import _torch_ipex as core
\ No newline at end of file
diff --git a/scripts/cpu/common/codegen.py b/scripts/cpu/common/codegen.py
new file mode 100644
index 000000000..70e52521a
--- /dev/null
+++ b/scripts/cpu/common/codegen.py
@@ -0,0 +1,15 @@
+import os
+
+def write_or_skip(filepath, content):
+    try:
+        with open(filepath, 'r') as f:
+            old_content = f.read()
+    except IOError:
+        old_content = None
+
+    if old_content != content:
+        with open(filepath, 'w') as f:
+            print('writing', filepath)
+            f.write(content)
+    else:
+        print('skipped writing', filepath)
\ No newline at end of file
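Note on the new `write_or_skip` helper above: it compares the new content against what is already on disk and only rewrites the file when something actually changed, so unchanged generated files keep their mtime and make/ninja skip recompiling their dependents. A minimal sketch of the behavior (the `demo.cpp` path and the `sys.path` tweak are illustrative, not part of the patch):

```python
# Illustrative only: exercises write_or_skip from scripts/cpu/common/codegen.py.
import os
import sys
import time

sys.path.append('scripts/cpu')            # assumed checkout-relative path
from common.codegen import write_or_skip

write_or_skip('demo.cpp', 'int main() { return 0; }\n')  # prints: writing demo.cpp
first_mtime = os.path.getmtime('demo.cpp')

time.sleep(1)
write_or_skip('demo.cpp', 'int main() { return 0; }\n')  # prints: skipped writing demo.cpp
assert os.path.getmtime('demo.cpp') == first_mtime       # mtime preserved, no rebuild
```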
diff --git a/scripts/cpu/gen-dense-cpu-ops.py b/scripts/cpu/gen-dense-cpu-ops.py
index 4096416fc..dce834882 100755
--- a/scripts/cpu/gen-dense-cpu-ops.py
+++ b/scripts/cpu/gen-dense-cpu-ops.py
@@ -9,6 +9,7 @@
 import sys
 import json
 
+from common.codegen import write_or_skip
 from common.cpp_sig_parser import CPPSig
 from common.aten_sig_parser import AtenSig
 
@@ -92,6 +93,12 @@
   .op(torch::RegisterOperators::options().schema("{}")
     .impl_unboxedOnlyKernel<{}, &{}>(at::DispatchKey::DPCPPTensorId)
     .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA))"""
+
+_REG_BLOCK = """
+namespace {{
+  static auto dispatch = torch::RegisterOperators(){reg_ops};
+}}"""
+
 _H_HEADER = """// Autogenerated file by {gen}. Do not edit directly!
 
 #pragma once
@@ -105,8 +112,6 @@
 class AtenIpexCPUDefault {{
 {hfuncs}
 }};
 
-void RegisterIpexDenseOPs();
-
 }} // namespace cpu
 }} // namespace torch_ipex
@@ -145,9 +150,7 @@ def __init__(self, reg_dec_file_path, func_file_path, op_h_file_path, op_cpp_fil
         self._reg_dec_file_path = reg_dec_file_path
         self._func_file_path = func_file_path
         self._op_h_file_path = op_h_file_path
-        self._op_h_file = None
         self._op_cpp_file_path = op_cpp_file_path
-        self._op_cpp_file = None
         self._sigs = []
         self._err_info = []
         self._func_data = ''
@@ -223,9 +226,6 @@ def prepare_functions(self):
         with open(self._func_file_path, 'r') as ff:
             self._func_data = ff.read()
 
-        self._op_h_file = open(self._op_h_file_path, 'w')
-        self._op_cpp_file = open(self._op_cpp_file_path, 'w')
-
         print('Extracted {} functions ({} errors) from {}'.format(
             len(self._sigs),
             len(self._err_info),
@@ -452,6 +452,23 @@ def gen_fallback_post_code(self, cpp_sig):
     def gen_head_dec_code(self, cpp_func_str_h):
         return ' static {};\n'.format(cpp_func_str_h)
 
+    def gen_cpu_ops_shard(self, func_defs, cpp_path, header_path, num_shards=1):
+        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join([f['dec'] for f in func_defs]))
+        write_or_skip(header_path, head_file_content)
+
+        shards = [[] for _ in range(num_shards)]
+        for idx, func in enumerate(func_defs):
+            shards[idx % num_shards].append(func)
+
+        for idx, shard in enumerate(shards):
+            regs_code = _REG_BLOCK.format(reg_ops=''.join([f['reg'] for f in shard]))
+            defs_code = ''.join([f['def'] for f in shard])
+
+            filename, ext = os.path.splitext(cpp_path)
+            shard_filepath = f'{filename}_{idx}{ext}'
+            shard_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=defs_code, regs=regs_code)
+            write_or_skip(shard_filepath, shard_content)
+
     def gen_code(self):
         self.prepare_functions()
         assert len(self._err_info) == 0
@@ -459,15 +476,13 @@ def is_conv_overrideable_func(fname):
             return fname in ['convolution_overrideable', 'convolution_backward_overrideable']
 
-        func_decs = []
-        func_regs = []
         func_defs = []
-        for cpp_sig, aten_sig, cpp_func_sig_str, aten_func_sig_str in self._sigs:
+        for cpp_sig, _, cpp_func_sig_str, aten_func_sig_str in self._sigs:
             cpp_func_str_h, cpp_func_str_cpp = self.gen_func_signature(cpp_func_sig_str)
 
             # Gen declaration code for head file
-            func_decs.append(self.gen_head_dec_code(cpp_func_str_h))
+            func_dec = self.gen_head_dec_code(cpp_func_str_h)
 
-            func_regs.append(_REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sig), "AtenIpexCPUDefault::" + cpp_sig.def_name))
+            func_reg = _REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sig), "AtenIpexCPUDefault::" + cpp_sig.def_name)
 
             # Gen definition code for cpp file
             code = '{} {{\n'.format(cpp_func_str_cpp)
@@ -480,23 +495,14 @@ def is_conv_overrideable_func(fname):
             code += self.gen_fallback_code(cpp_sig)
             code += self.gen_fallback_post_code(cpp_sig)
 
-            code += '}\n'
-
-            code += '\n'
-
-            func_defs.append(code)
-
-        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join(func_decs))
-
-        regs_code = 'void RegisterIpexDenseOPs() {\n'
-        regs_code += ' static auto dispatch = torch::RegisterOperators()\n'
-        regs_code += ''.join(func_regs)
-        regs_code += ';\n}\n'
+            code += '}\n\n'
 
-        source_file_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=''.join(func_defs), regs=regs_code)
-        print(head_file_content, file=self._op_h_file)
-        print(source_file_content, file=self._op_cpp_file)
+            func_defs.append({'dec': func_dec, 'reg': func_reg, 'def': code})
+
+        self.gen_cpu_ops_shard(func_defs,
+                               cpp_path=self._op_cpp_file_path,
+                               header_path=self._op_h_file_path,
+                               num_shards=8)
 
 
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser()
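The dense generator now shards its output across eight `DenseOPs_<idx>.cpp` files (which is why `.gitignore` switches from `DenseOPs.*` to `DenseOPs*`), so the heaviest generated translation unit can compile in parallel. A self-contained sketch of the round-robin split and shard naming used by `gen_cpu_ops_shard`, with made-up op names and an illustrative output path:

```python
# Round-robin sharding as in gen_cpu_ops_shard above; 'funcs' and the path
# are invented examples, but the splitting and naming mirror the diff.
import os

def shard_round_robin(items, num_shards):
    shards = [[] for _ in range(num_shards)]
    for idx, item in enumerate(items):
        shards[idx % num_shards].append(item)
    return shards

funcs = [f'op_{i}' for i in range(10)]
for idx, shard in enumerate(shard_round_robin(funcs, 4)):
    filename, ext = os.path.splitext('DenseOPs.cpp')
    print(f'{filename}_{idx}{ext}', shard)
# DenseOPs_0.cpp ['op_0', 'op_4', 'op_8']
# DenseOPs_1.cpp ['op_1', 'op_5', 'op_9']
# DenseOPs_2.cpp ['op_2', 'op_6']
# DenseOPs_3.cpp ['op_3', 'op_7']
```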
diff --git a/scripts/cpu/gen-sparse-cpu-ops.py b/scripts/cpu/gen-sparse-cpu-ops.py
index 3f99c9e18..e09941a9b 100755
--- a/scripts/cpu/gen-sparse-cpu-ops.py
+++ b/scripts/cpu/gen-sparse-cpu-ops.py
@@ -9,6 +9,7 @@
 import sys
 import json
 
+from common.codegen import write_or_skip
 from common.cpp_sig_parser import CPPSig
 from common.aten_sig_parser import AtenSig
 
@@ -47,6 +48,13 @@
   .op(torch::RegisterOperators::options().schema("{}")
     .impl_unboxedOnlyKernel<{}, &{}>(at::DispatchKey::SparseDPCPPTensorId)
     .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA))"""
+
+_REG_BLOCK = """
+namespace {{
+  static auto dispatch = torch::RegisterOperators(){reg_ops};
+}}"""
+
+
 _H_HEADER = """// Autogenerated file by {gen}. Do not edit directly!
 
 #pragma once
@@ -60,8 +68,6 @@
 class AtenIpexCPUSparse {{
 {hfuncs}
 }};
 
-void RegisterIpexSparseOPs();
-
 }} // namespace cpu
 }} // namespace torch_ipex
@@ -100,9 +106,7 @@ def __init__(self, reg_dec_file_path, func_file_path, sparse_dec_file_path, spar
         self._sparse_dec_file_path = sparse_dec_file_path
         self._sparse_attr_file_path = sparse_attr_file_path
         self._op_h_file_path = op_h_file_path
-        self._op_h_file = None
         self._op_cpp_file_path = op_cpp_file_path
-        self._op_cpp_file = None
         self._sigs = []
         self._sparse_attr_data = ''
         self._sparse_sigs = []
@@ -155,8 +159,8 @@ def prepare_functions(self):
                 continue
             cpp_func_sig_str = m.group(1)
             _sparse_sig_strs.append(cpp_func_sig_str)
-            print(cpp_func_sig_str)
-            print("********************")
+            # print(cpp_func_sig_str)
+            # print("********************")
 
         # Parse SparseAttrType.h
         with open(self._sparse_attr_file_path, 'r') as ff:
@@ -202,9 +206,6 @@ def prepare_functions(self):
                 self._err_info.append((cpp_func_sig, str(e)))
                 print('Error parsing "{}": {}'.format(cpp_func_sig, e), file=sys.stderr)
 
-        self._op_h_file = open(self._op_h_file_path, 'w')
-        self._op_cpp_file = open(self._op_cpp_file_path, 'w')
-
         print('Extracted {} functions ({} errors) from {}'.format(
             len(self._sigs),
             len(self._err_info),
@@ -369,44 +370,53 @@ def gen_fallback_post_code(self, cpp_sig):
     def gen_head_dec_code(self, cpp_func_str_h):
         return ' static {};\n'.format(cpp_func_str_h)
 
+    def gen_cpu_ops_shard(self, func_defs, cpp_path, header_path, num_shards=1):
+        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join([f['dec'] for f in func_defs]))
+        write_or_skip(header_path, head_file_content)
+
+        shards = [[] for _ in range(num_shards)]
+        for idx, func in enumerate(func_defs):
+            shards[idx % num_shards].append(func)
+
+        for idx, shard in enumerate(shards):
+            regs_code = _REG_BLOCK.format(reg_ops=''.join([f['reg'] for f in shard]))
+            defs_code = ''.join([f['def'] for f in shard])
+
+            filename, ext = os.path.splitext(cpp_path)
+            shard_filepath = f'{filename}_{idx}{ext}'
+            shard_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=defs_code, regs=regs_code)
+            write_or_skip(shard_filepath, shard_content)
+
     def gen_code(self):
         self.prepare_functions()
         assert len(self._err_info) == 0
 
-        func_decs = []
-        func_regs = []
         func_defs = []
         for cpp_sparse_sig, _, cpp_sparse_func_sig_str, aten_func_sig_str in self._sigs:
-            func_regs.append(_REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sparse_sig), "AtenIpexCPUSparse::" + cpp_sparse_sig.def_name))
             # Gen declaration code for head file
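The sparse generator (continued below) gets the same sharding machinery but keeps `num_shards=1`. In both generators, registration moves from an exported `RegisterIpex*OPs()` function into the `_REG_BLOCK` template, whose anonymous-namespace static registers the operators when the shared library loads. A sketch of what one rendered registration block looks like; the op schema and kernel names here are placeholders, not real generated output:

```python
# Renders the new _REG_BLOCK template with one fake entry to show the C++
# that lands in each generated shard.
_REG_BLOCK = """
namespace {{
  static auto dispatch = torch::RegisterOperators(){reg_ops};
}}"""

fake_reg_op = '''
  .op(torch::RegisterOperators::options().schema("aten::fake_op(Tensor self) -> Tensor")
    .impl_unboxedOnlyKernel<at::Tensor(const at::Tensor&), &AtenIpexCPUDefault::fake_op>(at::DispatchKey::DPCPPTensorId)
    .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA))'''

print(_REG_BLOCK.format(reg_ops=fake_reg_op))
# ->
# namespace {
#   static auto dispatch = torch::RegisterOperators()
#   .op(torch::RegisterOperators::options().schema("aten::fake_op(Tensor self) -> Tensor")
#     .impl_unboxedOnlyKernel<...>(at::DispatchKey::DPCPPTensorId)
#     .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA));
# }
```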
             cpp_func_str_h, cpp_func_str_cpp = self.gen_func_signature(cpp_sparse_func_sig_str)
-            func_decs.append(self.gen_head_dec_code(cpp_func_str_h))
+            func_dec = self.gen_head_dec_code(cpp_func_str_h)
 
-            # Since we have pre-defined attr OPs, we don't need to regenerate it
-            if self.is_sparse_attr_function(cpp_sparse_sig.def_name):
-                continue
+            func_reg = _REG_PATTERN.format(aten_func_sig_str, self.get_func_dec(cpp_sparse_sig), "AtenIpexCPUSparse::" + cpp_sparse_sig.def_name)
 
-            # Gen definition code for cpp file
-            code = '{} {{\n'.format(cpp_func_str_cpp)
-            code += self.gen_fallback_prepare_code(cpp_sparse_sig)
-            code += self.gen_fallback_code(cpp_sparse_sig)
-            code += self.gen_fallback_post_code(cpp_sparse_sig)
-
-            code += '}\n\n'
-
-            func_defs.append(code)
+            code = ''
+            # Since we have pre-defined attr OPs, we don't need to regenerate it
+            if not self.is_sparse_attr_function(cpp_sparse_sig.def_name):
 
-        head_file_content = _H_HEADER.format(gen=os.path.basename(sys.argv[0]), hfuncs=''.join(func_decs))
+                # Gen definition code for cpp file
+                code += '{} {{\n'.format(cpp_func_str_cpp)
+                code += self.gen_fallback_prepare_code(cpp_sparse_sig)
+                code += self.gen_fallback_code(cpp_sparse_sig)
+                code += self.gen_fallback_post_code(cpp_sparse_sig)
 
-        regs_code = 'void RegisterIpexSparseOPs() {\n'
-        regs_code += ' static auto dispatch = torch::RegisterOperators()\n'
-        regs_code += ''.join(func_regs)
-        regs_code += ';\n}\n'
+                code += '}\n\n'
 
-        source_file_content = _CPP_HEADER.format(gen=os.path.basename(sys.argv[0]), funcs=''.join(func_defs), regs=regs_code)
-        print(head_file_content, file=self._op_h_file)
-        print(source_file_content, file=self._op_cpp_file)
+            func_defs.append({'dec': func_dec, 'reg': func_reg, 'def': code})
+
+        self.gen_cpu_ops_shard(func_defs,
+                               cpp_path=self._op_cpp_file_path,
+                               header_path=self._op_h_file_path,
+                               num_shards=1)
 
 
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser()
diff --git a/setup.py b/setup.py
index 7a6a13904..663f573eb 100644
--- a/setup.py
+++ b/setup.py
@@ -151,6 +151,10 @@ def run(self):
 class DPCPPBuild(build_ext, object):
     def run(self):
         print("run")
+
+        # Generate the code before globbing!
+        generate_ipex_cpu_aten_code(base_dir)
+
         cmake = find_executable('cmake3') or find_executable('cmake')
         if cmake is None:
             raise RuntimeError(
@@ -170,6 +174,7 @@ def build_extension(self, ext):
             os.mkdir(ext.build_dir)
 
         build_type = 'Release'
+        use_ninja = False
 
         if _check_env_flag('DEBUG'):
             build_type = 'Debug'
@@ -193,6 +198,10 @@ def build_extension(self, ext):
         if _check_env_flag("DPCPP_ENABLE_PROFILING"):
             cmake_args += ['-DDPCPP_ENABLE_PROFILING=1']
 
+        if _check_env_flag("USE_NINJA"):
+            use_ninja = True
+            cmake_args += ['-GNinja']
+
         build_args = ['-j', str(multiprocessing.cpu_count())]
 
         env = os.environ.copy()
@@ -203,7 +212,10 @@ def build_extension(self, ext):
         check_call([self.cmake, ext.project_dir] + cmake_args, cwd=ext.build_dir, env=env)
 
         # build_args += ['VERBOSE=1']
-        check_call(['make'] + build_args, cwd=ext.build_dir, env=env)
+        if use_ninja:
+            check_call(['ninja'] + build_args, cwd=ext.build_dir, env=env)
+        else:
+            check_call(['make'] + build_args, cwd=ext.build_dir, env=env)
 
 
 ipex_git_sha, torch_git_sha = get_git_head_sha(base_dir)
@@ -212,8 +224,6 @@
 # Generate version info (torch_xla.__version__)
 create_version_files(base_dir, version, ipex_git_sha, torch_git_sha)
 
-# Generate the code before globbing!
-generate_ipex_cpu_aten_code(base_dir)
 
 
 # Constant known variables used throughout this file
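`setup.py` now runs code generation inside `DPCPPBuild.run()` so the generated sources exist before any file globbing, and it can optionally drive the build with Ninja. `_check_env_flag` already exists in `setup.py` but its body is outside this diff; the sketch below assumes a conventional truthy-string check:

```python
# Assumed shape of setup.py's existing _check_env_flag helper (its definition
# is not part of this diff); it gates DEBUG, DPCPP_ENABLE_PROFILING, and the
# new USE_NINJA switch.
import os

def _check_env_flag(name, default=''):
    return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y']

# e.g. `USE_NINJA=1 python setup.py install` configures CMake with -GNinja and
# then runs `ninja -j <cpu_count>` instead of `make -j <cpu_count>`.
```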
diff --git a/tests/cpu/common_device_type.py b/tests/cpu/common_device_type.py
index ee6349eef..a68cfe42d 100644
--- a/tests/cpu/common_device_type.py
+++ b/tests/cpu/common_device_type.py
@@ -50,7 +50,6 @@
 import unittest
 import torch
 import _torch_ipex as ipex
-ipex._initialize_aten_bindings()
 import copy
 from common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf
diff --git a/tests/cpu/test_conf.py b/tests/cpu/test_conf.py
index 2df27c2e9..5ca82af61 100644
--- a/tests/cpu/test_conf.py
+++ b/tests/cpu/test_conf.py
@@ -5,7 +5,6 @@
 
 import torch
 import _torch_ipex as ipex
-ipex._initialize_aten_bindings()
 import torch.nn as nn
 import torch.backends.cudnn as cudnn
 
diff --git a/tests/cpu/test_lazy_reorder.py b/tests/cpu/test_lazy_reorder.py
index 34d4f1fc0..fb26d9437 100644
--- a/tests/cpu/test_lazy_reorder.py
+++ b/tests/cpu/test_lazy_reorder.py
@@ -12,7 +12,6 @@
 import sys
 import torch
 import _torch_ipex as ipex
-ipex._initialize_aten_bindings()
 import intel_pytorch_extension
 
 import torch.nn as nn
diff --git a/tests/cpu/test_rn50_cpu_ops.py b/tests/cpu/test_rn50_cpu_ops.py
index c53c78a3f..2f5a8ec8a 100644
--- a/tests/cpu/test_rn50_cpu_ops.py
+++ b/tests/cpu/test_rn50_cpu_ops.py
@@ -56,7 +56,6 @@
 
 import torch
 import _torch_ipex as ipex
-ipex._initialize_aten_bindings()
 import intel_pytorch_extension
 
 import torch.nn as nn
diff --git a/torch_ipex/csrc/aten_ipex_type.cpp b/torch_ipex/csrc/aten_ipex_type.cpp
index bb1c08285..e1def87d1 100644
--- a/torch_ipex/csrc/aten_ipex_type.cpp
+++ b/torch_ipex/csrc/aten_ipex_type.cpp
@@ -10,18 +10,4 @@
 
 namespace torch_ipex {
 
-namespace {
-
-void AtenInitialize() {
-  cpu::RegisterIpexDenseOPs();
-  cpu::RegisterIpexSparseOPs();
-}
-
-} // namespace
-
-void AtenIpexType::InitializeAtenBindings() {
-  static std::once_flag once;
-  std::call_once(once, []() { AtenInitialize(); });
-}
-
 } // namespace torch_ipex
diff --git a/torch_ipex/csrc/aten_ipex_type.h b/torch_ipex/csrc/aten_ipex_type.h
index e42bc083d..281d0b3f2 100644
--- a/torch_ipex/csrc/aten_ipex_type.h
+++ b/torch_ipex/csrc/aten_ipex_type.h
@@ -6,8 +6,6 @@ namespace torch_ipex {
 
 // Base ATEN Type class where the IPE specific overrides should be defined.
 class AtenIpexType {
- public:
-  static void InitializeAtenBindings();
 };
 
 } // namespace torch_ipex
diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp
index 6cb5b7c2f..d2632b359 100644
--- a/torch_ipex/csrc/init_python_bindings.cpp
+++ b/torch_ipex/csrc/init_python_bindings.cpp
@@ -54,8 +54,6 @@ dil::dims getDilTensorStrides(const at::Tensor &tensor) {
 /// ****************************
 
 void InitIpexModuleBindings(py::module m) {
-  m.def("_initialize_aten_bindings",
-        []() { AtenIpexType::InitializeAtenBindings(); });
   m.def("_get_git_revs", []() { return GetRevisions(); });
   m.def("enable_auto_dnnl", []() { AutoOptConfig::singleton().set_auto_dnnl(true); });
   m.def("disable_auto_dnnl", []() { AutoOptConfig::singleton().set_auto_dnnl(false); });
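Net effect for users and the test suite: with registration now static, the `_initialize_aten_bindings` Python binding and its C++ backing are deleted everywhere, including the test imports above, and importing the module is enough:

```python
# Before this patch, callers did:
#   import _torch_ipex as ipex
#   ipex._initialize_aten_bindings()
# Now each generated shard registers its kernels when the extension loads,
# so the import alone suffices (the old call would raise AttributeError).
import _torch_ipex as ipex
```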