diff --git a/INSTALL.md b/INSTALL.md
index d9b9f0209..ed9e2b6ba 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -7,7 +7,7 @@
 The core library is written in PyTorch. Several components have underlying implementation in CUDA for improved performance. A subset of these components have CPU implementations in C++/Pytorch. It is advised to use PyTorch3d with GPU support in order to use all the features.
 
-- Linux or macOS
+- Linux or macOS or Windows
 - Python ≥ 3.6
 - PyTorch 1.4
 - torchvision that matches the PyTorch installation. You can install them together at pytorch.org to make sure of this.
@@ -72,3 +72,41 @@ To rebuild after installing from a local clone run, `rm -rf build/ **/*.so` then
 ```
 MACOSX_DEPLOYMENT_TARGET=10.14 CC=clang CXX=clang++ pip install -e .
 ```
+
+**Install from local clone on Windows:**
+
+If you are using pre-compiled PyTorch 1.4 and torchvision 0.5, you need to make the following changes to the installed PyTorch headers in order to compile successfully with Visual Studio 2019 (MSVC 19.16.27034) and CUDA 10.1.
+
+Change python/Lib/site-packages/torch/include/csrc/jit/script/module.h
+
+L466, 476, 493, 506, 536
+```
+-static constexpr *
++static const *
+```
+Change python/Lib/site-packages/torch/include/csrc/jit/argument_spec.h
+
+L190
+```
+-static constexpr size_t DEPTH_LIMIT = 128;
++static const size_t DEPTH_LIMIT = 128;
+```
+
+Change python/Lib/site-packages/torch/include/pybind11/cast.h
+
+L1449
+```
+-explicit operator type&() { return *(this->value); }
++explicit operator type& () { return *((type*)(this->value)); }
+```
+
+After patching, open the "x64 Native Tools Command Prompt for VS 2019" and run the following to compile and install:
+```
+cd pytorch3d
+python3 setup.py install
+```
+After installing, verify that all unit tests pass:
+```
+cd tests
+python3 -m unittest discover -p *.py
+```
\ No newline at end of file
diff --git a/pytorch3d/csrc/gather_scatter/gather_scatter.cu b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
index 0b34b86a1..826a6ab70 100644
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -5,7 +5,7 @@
 // TODO(T47953967) to make this cuda kernel support all datatypes.
 __global__ void gather_scatter_kernel(
     const float* __restrict__ input,
-    const long* __restrict__ edges,
+    const int64_t* __restrict__ edges,
     float* __restrict__ output,
     bool directed,
     bool backward,
@@ -21,8 +21,8 @@
   // Edges are split evenly across the blocks.
   for (int e = blockIdx.x; e < E; e += gridDim.x) {
     // Get indices of vertices which form the edge.
-    const long v0 = edges[2 * e + v0_idx];
-    const long v1 = edges[2 * e + v1_idx];
+    const int64_t v0 = edges[2 * e + v0_idx];
+    const int64_t v1 = edges[2 * e + v1_idx];
 
     // Split vertex features evenly across threads.
     // This implementation will be quite wasteful when D<128 since there will be
@@ -57,7 +57,7 @@ at::Tensor gather_scatter_cuda(
 
   gather_scatter_kernel<<<blocks, threads>>>(
       input.data_ptr<float>(),
-      edges.data_ptr<long>(),
+      edges.data_ptr<int64_t>(),
      output.data_ptr<float>(),
       directed,
       backward,
diff --git a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
index e1f119d00..d5bebb66b 100644
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
+++ b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
@@ -6,7 +6,7 @@
 template <typename scalar_t>
 __device__ void WarpReduce(
     volatile scalar_t* min_dists,
-    volatile long* min_idxs,
+    volatile int64_t* min_idxs,
     const size_t tid) {
   // s = 32
   if (min_dists[tid] > min_dists[tid + 32]) {
@@ -57,7 +57,7 @@ template <typename scalar_t>
 __global__ void NearestNeighborKernel(
     const scalar_t* __restrict__ points1,
     const scalar_t* __restrict__ points2,
-    long* __restrict__ idx,
+    int64_t* __restrict__ idx,
     const size_t N,
     const size_t P1,
     const size_t P2,
@@ -74,7 +74,7 @@ __global__ void NearestNeighborKernel(
   extern __shared__ char shared_buf[];
   scalar_t* x = (scalar_t*)shared_buf; // scalar_t[DD]
   scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
-  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
 
   const size_t n = blockIdx.y; // index of batch element.
   const size_t i = blockIdx.x; // index of point within batch element.
@@ -147,14 +147,14 @@ template <typename scalar_t>
 __global__ void NearestNeighborKernelD3(
     const scalar_t* __restrict__ points1,
     const scalar_t* __restrict__ points2,
-    long* __restrict__ idx,
+    int64_t* __restrict__ idx,
     const size_t N,
     const size_t P1,
     const size_t P2) {
   // Single shared memory buffer which is split and cast to different types.
   extern __shared__ char shared_buf[];
   scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
-  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
 
   const size_t D = 3;
   const size_t n = blockIdx.y; // index of batch element.
@@ -230,12 +230,12 @@ at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
     // Use the specialized kernel for D=3.
     AT_DISPATCH_FLOATING_TYPES(p1.type(), "nearest_neighbor_v3_cuda", ([&] {
       size_t shared_size = threads * sizeof(size_t) +
-          threads * sizeof(long);
+          threads * sizeof(int64_t);
       NearestNeighborKernelD3<scalar_t>
           <<<blocks, threads, shared_size>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
-             idx.data_ptr<long>(),
+             idx.data_ptr<int64_t>(),
              N,
              P1,
              P2);
@@ -248,11 +248,11 @@ at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
       // need to be rounded to the next even size.
       size_t D_2 = D + (D % 2);
       size_t shared_size = (D_2 + threads) * sizeof(size_t);
-      shared_size += threads * sizeof(long);
+      shared_size += threads * sizeof(int64_t);
       NearestNeighborKernel<scalar_t><<<blocks, threads, shared_size>>>(
           p1.data_ptr<scalar_t>(),
           p2.data_ptr<scalar_t>(),
-          idx.data_ptr<long>(),
+          idx.data_ptr<int64_t>(),
           N,
           P1,
           P2,
diff --git a/pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh b/pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh
index ddb33349a..f1bac1127 100644
--- a/pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh
+++ b/pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh
@@ -7,7 +7,11 @@
 #include "float_math.cuh"
 
 // Set epsilon for preventing floating point errors and division by 0.
+#ifdef _MSC_VER
+#define kEpsilon 1e-30f
+#else
 const auto kEpsilon = 1e-30;
+#endif
 
 // Determines whether a point p is on the right side of a 2D line segment
 // given by the end points v0, v1.
@@ -93,7 +97,7 @@ BarycentricCoordsBackward(
     const float2& v2,
     const float3& grad_bary_upstream) {
   const float area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
-  const float area2 = pow(area, 2.0);
+  const float area2 = pow(area, 2.0f);
   const float e0 = EdgeFunctionForward(p, v1, v2);
   const float e1 = EdgeFunctionForward(p, v2, v0);
   const float e2 = EdgeFunctionForward(p, v0, v1);
diff --git a/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp b/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
index fdb4f852d..3ea2a2134 100644
--- a/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
@@ -7,7 +7,7 @@
 // Given a pixel coordinate 0 <= i < S, convert it to a normalized device
 // coordinate in the range [-1, 1]. The NDC range is divided into S evenly-sized
 // pixels, and assume that each pixel falls in the *center* of its range.
-inline float PixToNdc(const int i, const int S) {
+static float PixToNdc(const int i, const int S) {
   // NDC x-offset + (i * pixel_width + half_pixel_width)
   return -1 + (2 * i + 1.0f) / S;
 }
@@ -74,7 +74,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaiveCpu(
   return std::make_tuple(point_idxs, zbuf, pix_dists);
 }
 
-std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu(
+torch::Tensor RasterizePointsCoarseCpu(
     const torch::Tensor& points,
     const int image_size,
     const float radius,
@@ -140,7 +140,7 @@ std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu(
       bin_y_max = bin_y_min + bin_width;
     }
   }
-  return std::make_tuple(points_per_bin, bin_points);
+  return bin_points;
 }
 
 torch::Tensor RasterizePointsBackwardCpu(
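
For reviewers wondering about the recurring `long` → `int64_t` substitution in the kernels above: PyTorch allocates index tensors as 64-bit integers (`torch::kInt64`), and on Linux and macOS (LP64) `long` happens to be 64 bits wide, so `const long*` worked by coincidence. MSVC on Windows uses the LLP64 data model, where `long` is only 32 bits, so the same code would misread the edge and index buffers and under-allocate shared memory (`threads * sizeof(long)` reserves half the bytes the kernels expect). The snippet below is a minimal standalone C++ sketch of that size difference; it is illustrative only, not part of the patch, and assumes nothing beyond the standard `<cstdint>` and `<cstdio>` headers.

```
// Illustrative only (not part of the patch): why int64_t instead of long.
#include <cstdint>
#include <cstdio>

int main() {
  // LP64 (Linux/macOS): sizeof(long) == 8, matching PyTorch's int64 tensors.
  // LLP64 (Windows/MSVC): sizeof(long) == 4, so a long* view of an int64
  // tensor would read each index as two unrelated 32-bit halves.
  std::printf("sizeof(long)    = %zu\n", sizeof(long));
  std::printf("sizeof(int64_t) = %zu\n", sizeof(int64_t));

  // int64_t is exactly 64 bits on every platform, so the kernel arguments,
  // the shared-memory sizing, and data_ptr<int64_t>() stay consistent.
  static_assert(sizeof(int64_t) == 8, "int64_t is always 8 bytes");
  return 0;
}
```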