Enable detectron on AMD GPU

Sandeep Kumar · facebook-github-bot · commit 6248266d911f · 2019-03-12T16:29:42.000-07:00
Summary: Pull Request resolved: pytorch#17862; Differential Revision: D14429234; Pulled By: bddppq; fbshipit-source-id: 5cb8750bd9db0ff8a179977d2bfbb180265cce81
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
@@ -1,3 +1,4 @@
+project(modules CXX C)
 add_subdirectory(detectron)
 add_subdirectory(module_test)
 add_subdirectory(observers)
diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt
@@ -1,5 +1,6 @@
 file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
+file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip)
 
 if (BUILD_CAFFE2_OPS)
   if (USE_OPENMP AND OPENMP_FOUND)
@@ -19,6 +20,16 @@ if (BUILD_CAFFE2_OPS)
     if (MSVC)
       install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops_gpu> DESTINATION lib OPTIONAL)
     endif()
+  elseif(USE_ROCM)
+    hip_include_directories(${Caffe2_HIP_INCLUDES})
+    set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+    HIP_ADD_LIBRARY(
+        caffe2_detectron_ops_hip SHARED
+        ${Detectron_CPU_SRCS}
+        ${Detectron_HIP_SRCS})
+    target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS})
+    target_link_libraries(caffe2_detectron_ops_hip caffe2_hip)
+    install(TARGETS caffe2_detectron_ops_hip DESTINATION lib)
   elseif(NOT IOS_PLATFORM)
     add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
     target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link})
diff --git a/modules/detectron/batch_permutation_op.cu b/modules/detectron/batch_permutation_op.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "batch_permutation_op.h"
+#include "modules/detectron/batch_permutation_op.h"
 #include "caffe2/core/context_gpu.h"
 
 namespace caffe2 {
diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu
@@ -17,7 +17,7 @@
 #include <cfloat>
 
 #include "caffe2/core/context_gpu.h"
-#include "group_spatial_softmax_op.h"
+#include "modules/detectron/group_spatial_softmax_op.h"
 
 namespace caffe2 {
 
diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu
@@ -73,7 +73,7 @@
 #include <cfloat>
 
 #include "caffe2/core/context_gpu.h"
-#include "ps_roi_pool_op.h"
+#include "modules/detectron/ps_roi_pool_op.h"
 
 namespace caffe2 {
 
@@ -123,8 +123,8 @@ __global__ void PSRoIPoolForward(
       roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
 
     // Force too small ROIs to be 1x1
-    T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
-    T roi_height = max(roi_end_h - roi_start_h, 0.1);
+    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1));  // avoid 0
+    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
 
     // Compute w and h at bottom
     T bin_size_h = roi_height / static_cast<T>(pooled_height);
@@ -200,8 +200,8 @@ __global__ void PSRoIPoolBackward(
       roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
 
     // Force too small ROIs to be 1x1
-    T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
-    T roi_height = max(roi_end_h - roi_start_h, 0.1);
+    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); //avoid 0
+    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
 
     // Compute w and h at bottom
     T bin_size_h = roi_height / static_cast<T>(pooled_height);
diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu
@@ -17,7 +17,7 @@
 #include <cfloat>
 
 #include "caffe2/core/context_gpu.h"
-#include "roi_pool_f_op.h"
+#include "modules/detectron/roi_pool_f_op.h"
 
 namespace caffe2 {
 
diff --git a/modules/detectron/sample_as_op.cu b/modules/detectron/sample_as_op.cu
@@ -21,7 +21,7 @@ Y's output samples are the samples of X for which L > 0.
 #include <cfloat>
 
 #include "caffe2/core/context_gpu.h"
-#include "sample_as_op.h"
+#include "modules/detectron/sample_as_op.h"
 
 #include <stdio.h>
 
diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu
@@ -15,7 +15,7 @@
  */
 
 #include "caffe2/core/context_gpu.h"
-#include "select_smooth_l1_loss_op.h"
+#include "modules/detectron/select_smooth_l1_loss_op.h"
 
 namespace caffe2 {
 
@@ -38,11 +38,11 @@ __global__ void SelectSmoothL1Kernel(
       float y_hat = Y_hat[ind];
       float y = Y[i * 4 + j];
       float val = y_hat - y;
-      float abs_val = abs(val);
+      float abs_val = c10::cuda::compat::abs(val);
       if (abs_val < beta) {
-        out[ind] = (0.5 * val * val / beta) / max(S[0], 1.0);
+        out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
       } else {
-        out[ind] = (abs_val - 0.5 * beta) / max(S[0], 1.0);
+        out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
       }
     }
   }
@@ -75,11 +75,11 @@ __global__ void SelectSmoothL1GradientKernel(
       float y_hat = Y_hat[ind];
       float y = Y[i * 4 + j];
       float val = y_hat - y;
-      float abs_val = abs(val);
+      float abs_val = c10::cuda::compat::abs(val);
       if (abs_val < beta) {
-        out[ind] = norm * d_loss * val / beta / max(S[0], 1.0);
+        out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
       } else {
-        out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / max(S[0], 1.0);
+        out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
       }
     }
   }
diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu
@@ -15,7 +15,7 @@
  */
 
 #include "caffe2/core/context_gpu.h"
-#include "sigmoid_cross_entropy_loss_op.h"
+#include "modules/detectron/sigmoid_cross_entropy_loss_op.h"
 
 namespace caffe2 {
 
diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu
@@ -17,7 +17,7 @@
 #include <cfloat>
 
 #include "caffe2/core/context_gpu.h"
-#include "sigmoid_focal_loss_op.h"
+#include "modules/detectron/sigmoid_focal_loss_op.h"
 
 namespace caffe2 {
 
@@ -45,15 +45,15 @@ __global__ void SigmoidFocalLossKernel(
     float c1 = (t == (d + 1));
     float c2 = (t != -1 & t != (d + 1));
 
-    float Np = max(weight_pos[0], 1.0);
+    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
     float zn = (1.0 - alpha) / Np;
     float zp = alpha / Np;
 
     // p = 1. / 1. + expf(-x)
     float p = 1. / (1. + expf(-logits[i]));
 
     // (1 - p)**gamma * log(p) where
-    float term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
+    float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN));
     // p**gamma * log(1 - p)
     float term2 =
         powf(p, gamma) *
@@ -82,7 +82,7 @@ __global__ void SigmoidFocalLossGradientKernel(
       int a = c / num_classes;   // current anchor
       int d = c % num_classes;   // current class
 
-      float Np = max(weight_pos[0], 1.0);
+      float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
       float zn = (1.0 - alpha) / Np;
       float zp = alpha / Np;
       int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];
@@ -94,7 +94,7 @@ __global__ void SigmoidFocalLossGradientKernel(
       // (1-p)**g * (1 - p - g*p*log(p))
       float term1 =
           powf((1. - p), gamma) *
-          (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
+          (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN))));
       // (p**g) * (g*(1-p)*log(1-p) - p)
       float term2 =
           powf(p, gamma) *
diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu
@@ -15,7 +15,7 @@
  */
 
 #include "caffe2/core/context_gpu.h"
-#include "smooth_l1_loss_op.h"
+#include "modules/detectron/smooth_l1_loss_op.h"
 
 namespace caffe2 {
 
@@ -27,7 +27,7 @@ __global__ void SmoothL1Kernel(
   //        |x| - 0.5 * beta      otherwise
   CUDA_1D_KERNEL_LOOP(index, n) {
     T val = in[index];
-    T abs_val = abs(val);
+    T abs_val = c10::cuda::compat::abs(val);
     if (abs_val < beta) {
       out[index] = 0.5 * val * val / beta;
     } else {
@@ -49,7 +49,7 @@ __global__ void SmoothL1GradientKernel(
   // We also scale by norm * d_loss in this kernel for convenience
   CUDA_1D_KERNEL_LOOP(index, n) {
     T val = in[index];
-    T abs_val = abs(val);
+    T abs_val = c10::cuda::compat::abs(val);
     T d_loss = *d_loss_data;
     if (abs_val < beta) {
       out[index] = norm * d_loss * val / beta;
diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu
@@ -17,7 +17,7 @@
 #include <cfloat>
 
 #include "caffe2/core/context_gpu.h"
-#include "softmax_focal_loss_op.h"
+#include "modules/detectron/softmax_focal_loss_op.h"
 
 namespace caffe2 {
 
@@ -69,7 +69,7 @@ __global__ void SoftmaxFocalLossKernel(
     int n = i / (W * H * A);
     const int label = static_cast<int>(targets[i]);
 
-    float Np = max(weight_pos[0], 1.0);
+    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
     float z = (label == 0) * (1 - alpha) / Np +
               (label >= 1) * alpha / Np;
 
@@ -79,7 +79,7 @@ __global__ void SoftmaxFocalLossKernel(
       int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
       losses[i] =
           -(pow(1.0f - Pdata[idx], gamma) *
-          log(max(Pdata[idx], FLT_MIN))) * z;
+          log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z;
     }
   }
 }
@@ -97,7 +97,7 @@ __global__ void SoftmaxFocalLossGradientWeightKernel(
     int a = (i / (W * H)) % A;
     int n = i / (W * H * A);
     const int label = static_cast<int>(targets[i]);
-    float Np = max(weight_pos[0], 1.0);
+    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
     float z =  (label == 0) * (1 - alpha) / Np +
                (label >= 1) * alpha / Np;
 
@@ -109,7 +109,7 @@ __global__ void SoftmaxFocalLossGradientWeightKernel(
       float p = Pdata[idx];
       buff[i] =
           (-pow(onemp, gamma) +
-          gamma * pow(onemp, gamma - 1) * p * log(max(p, FLT_MIN))) * z;
+          gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z;
     }
   }
 }
diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu
@@ -16,7 +16,7 @@
 
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/operator.h"
-#include "spatial_narrow_as_op.h"
+#include "modules/detectron/spatial_narrow_as_op.h"
 
 namespace caffe2 {
 
diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu
@@ -58,7 +58,7 @@
 
 
 #include "caffe2/core/context_gpu.h"
-#include "upsample_nearest_op.h"
+#include "modules/detectron/upsample_nearest_op.h"
 
 namespace caffe2 {
 
diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py
@@ -63,6 +63,7 @@
     "caffe2/utils/*",
     "c10/cuda/*",
     "c10/cuda/test/CMakeLists.txt",
+    "modules/*",
     # PyTorch paths
     # Keep this synchronized with is_pytorch_file in hipify_python.py
     "aten/src/ATen/cuda/*",

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+project(modules CXX C)`
`1`	`2`	`add_subdirectory(detectron)`
`2`	`3`	`add_subdirectory(module_test)`
`3`	`4`	`add_subdirectory(observers)`
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`*/`
`16`	`16`
`17`	`17`	`#include "caffe2/core/context_gpu.h"`
`18`		`-#include "select_smooth_l1_loss_op.h"`
	`18`	`+#include "modules/detectron/select_smooth_l1_loss_op.h"`
`19`	`19`
`20`	`20`	`namespace caffe2 {`
`21`	`21`
`@@ -38,11 +38,11 @@ __global__ void SelectSmoothL1Kernel(`
`38`	`38`	`float y_hat = Y_hat[ind];`
`39`	`39`	`float y = Y[i * 4 + j];`
`40`	`40`	`float val = y_hat - y;`
`41`		`- float abs_val = abs(val);`
	`41`	`+ float abs_val = c10::cuda::compat::abs(val);`
`42`	`42`	`if (abs_val < beta) {`
`43`		`- out[ind] = (0.5 * val * val / beta) / max(S[0], 1.0);`
	`43`	`+ out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));`
`44`	`44`	`} else {`
`45`		`- out[ind] = (abs_val - 0.5 * beta) / max(S[0], 1.0);`
	`45`	`+ out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));`
`46`	`46`	`}`
`47`	`47`	`}`
`48`	`48`	`}`
`@@ -75,11 +75,11 @@ __global__ void SelectSmoothL1GradientKernel(`
`75`	`75`	`float y_hat = Y_hat[ind];`
`76`	`76`	`float y = Y[i * 4 + j];`
`77`	`77`	`float val = y_hat - y;`
`78`		`- float abs_val = abs(val);`
	`78`	`+ float abs_val = c10::cuda::compat::abs(val);`
`79`	`79`	`if (abs_val < beta) {`
`80`		`- out[ind] = norm * d_loss * val / beta / max(S[0], 1.0);`
	`80`	`+ out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast<float>(1.0));`
`81`	`81`	`} else {`
`82`		`- out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / max(S[0], 1.0);`
	`82`	`+ out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));`
`83`	`83`	`}`
`84`	`84`	`}`
`85`	`85`	`}`