
Commit 3076890

Update on "Modernize extension-cpp; refactor code"

This PR:
- creates a single unified build for extension-cpp (instead of having separate cpu/cuda setup.pys)
- updates the build system to use pyproject.toml (instead of only setup.py)
- uses TORCH_LIBRARY to bind operators (instead of using PyBind)

Future work will add further improvements (e.g. torch.compile support) and fix up the corresponding C++ extensions tutorial.

Test Plan:
- Refactored all of the tests under test/

[ghstack-poisoned]
1 parent 735149e commit 3076890
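The move to pyproject.toml mentioned in the commit message typically amounts to declaring the build backend and build-time requirements there. A minimal hypothetical sketch (the PR's actual pyproject.toml is not shown in this diff; the exact `requires` list is an assumption):

```toml
# Hypothetical pyproject.toml sketch: torch is listed in build-time
# requires because torch.utils.cpp_extension must be importable at build time.
[build-system]
requires = ["setuptools", "torch"]
build-backend = "setuptools.build_meta"
```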

File tree

3 files changed (+10, -5 lines)

README.md — 2 additions & 0 deletions

@@ -2,6 +2,8 @@
 
 An example of writing a C++/CUDA extension for PyTorch. See
 [here](http://pytorch.org/tutorials/advanced/cpp_extension.html) for the accompanying tutorial.
+This repo demonstrates how to write an example `extension_cpp.ops.lltm`
+custom op that has both custom CPU and CUDA kernels.
 
 To build:
 ```

setup.py — 2 additions & 1 deletion

@@ -21,10 +21,11 @@
 
 def get_extensions():
     debug_mode = os.getenv("DEBUG", "0") == "1"
+    use_cuda = os.getenv("USE_CUDA", "1") == "1"
     if debug_mode:
         print("Compiling in debug mode")
 
-    use_cuda = torch.cuda.is_available() and CUDA_HOME is not None
+    use_cuda = use_cuda and torch.cuda.is_available() and CUDA_HOME is not None
     extension = CUDAExtension if use_cuda else CppExtension
 
     extra_link_args = []
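The env-var gating added to setup.py can be sketched as a standalone helper. `resolve_use_cuda`, `cuda_available`, and `cuda_home` are hypothetical names standing in for the diff's inline logic, `torch.cuda.is_available()`, and `CUDA_HOME`:

```python
def resolve_use_cuda(environ, cuda_available, cuda_home):
    # USE_CUDA=0 lets a user force a CPU-only build even on a CUDA machine;
    # it defaults to "1" so CUDA is used whenever the toolchain is present.
    requested = environ.get("USE_CUDA", "1") == "1"
    return requested and cuda_available and cuda_home is not None

# USE_CUDA unset: falls back to toolchain detection.
print(resolve_use_cuda({}, True, "/usr/local/cuda"))
# USE_CUDA=0: opt out regardless of available hardware.
print(resolve_use_cuda({"USE_CUDA": "0"}, True, "/usr/local/cuda"))
# No CUDA toolkit found: CPU build even if a GPU is visible.
print(resolve_use_cuda({}, True, None))
```

The design point of the diff is that user intent (`USE_CUDA`) only ever narrows the automatic detection; it cannot force a CUDA build on a machine without the toolkit.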

test/test_extension.py — 6 additions & 4 deletions

@@ -34,22 +34,24 @@ def _test_correctness(self, device):
         torch.testing.assert_close(result, expected)
 
     def test_correctness_cpu(self):
-        self._test_lltm_correctness("cpu")
+        self._test_correctness("cpu")
 
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
     def test_correctness_cuda(self):
-        self._test_lltm_correctness("cuda")
+        self._test_correctness("cuda")
 
     def _test_gradients(self, device):
         args = sample_inputs(device)
         torch.autograd.gradcheck(extension_cpp.ops.lltm, args)
 
     def test_gradients_cpu(self):
-        self._test_lltm_grad("cpu")
+        self._test_gradients("cpu")
 
     # This is supposed to succeed, there's probably a bug in the CUDA kernel.
     @unittest.expectedFailure
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
     def test_gradients_cuda(self):
-        self._test_lltm_grad("cuda")
+        self._test_gradients("cuda")
 
 
 if __name__ == "__main__":
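The `@unittest.skipIf` decorators added in this diff follow standard `unittest` behavior: the condition is evaluated at class-definition time and the test is reported as skipped, not failed. A minimal self-contained sketch, with a hypothetical `HAS_CUDA` flag standing in for `torch.cuda.is_available()`:

```python
import unittest

HAS_CUDA = False  # stand-in for torch.cuda.is_available()

class ExampleTests(unittest.TestCase):
    def test_cpu(self):
        self.assertTrue(True)  # always runs

    @unittest.skipIf(not HAS_CUDA, "requires cuda")
    def test_cuda(self):
        # Never reached on a CPU-only machine: the decorator skips it first.
        self.fail("would only run when CUDA is available")

suite = unittest.TestLoader().loadTestsFromTestCase(ExampleTests)
result = unittest.TextTestRunner(verbosity=0).run(suite)
print(len(result.skipped))    # 1
print(result.wasSuccessful()) # True
```

Skipped tests do not count against `wasSuccessful()`, which is why gating CUDA tests this way keeps the suite green on CPU-only CI machines.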
