Commit 3d800b3

authored Aug 12, 2021
Fix CPU only mode for GPU tensors (triton-inference-server#74)

* Fix CPU only mode for GPU tensors
* Review edits
1 parent 60a6496 commit 3d800b3

9 files changed: +191, -95 lines
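
In short, the commit stops linking the backend and its stub against CUDA::cuda_driver and instead loads libcuda.so at runtime through the new CUDADriverAPI helper (dlopen/dlsym, hence the added -ldl link flags). A build with TRITON_ENABLE_GPU=ON therefore still loads on a machine without a CUDA driver, and GetInputTensor falls back to CPU tensors when the driver is unavailable. A minimal sketch of the underlying dlopen/dlsym pattern, illustrative only and not code from this commit:

    // Probe for the CUDA driver at runtime instead of requiring it at link time.
    // Compile with: g++ probe_cuda.cpp -ldl   (the file name is hypothetical)
    #include <dlfcn.h>
    #include <cstdio>

    int main() {
      void* handle = dlopen("libcuda.so", RTLD_LAZY);
      if (handle == nullptr) {
        // No CUDA driver installed: a backend built this way keeps working
        // and simply treats every tensor as a CPU tensor.
        std::printf("libcuda.so not found; CPU-only mode\n");
        return 0;
      }

      // Resolve a driver entry point by name. The signature below only
      // approximates cuPointerGetAttribute so <cuda.h> is not needed here.
      using PointerGetAttributeFn = int (*)(void*, int, unsigned long long);
      auto fn = reinterpret_cast<PointerGetAttributeFn>(
          dlsym(handle, "cuPointerGetAttribute"));
      std::printf("cuPointerGetAttribute %s\n", fn ? "resolved" : "missing");

      dlclose(handle);
      return 0;
    }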
 

‎CMakeLists.txt

+3, -6

@@ -37,8 +37,7 @@ project(tritonpythonbackend LANGUAGES C CXX)
 # GPU support is disabled by default because python backend doesn't
 # because python backend does not need to access CUDA or GPUs
 #
-option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
-option(TRITON_ENABLE_GPU_TENSORS "Allow GPU input and output tensors" OFF)
+option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
 option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

 set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
@@ -199,6 +198,7 @@ target_link_libraries(
   triton-backend-utils # from repo-backend
   ZLIB::ZLIB
   -larchive # shared memory
+  -ldl # dlopen
 )

 target_link_libraries(
@@ -210,19 +210,16 @@ target_link_libraries(
   triton-backend-utils # from repo-backend
   -larchive # libarchive
   -lrt # shared memory
+  -ldl # dlopen
 )

 if(${TRITON_ENABLE_GPU})
   target_link_libraries(
     triton-python-backend-stub
-    PUBLIC
-      CUDA::cuda_driver
   )

   target_link_libraries(
     triton-python-backend
-    PUBLIC
-      CUDA::cuda_driver
   )
 endif() # TRITON_ENABLE_GPU

‎src/pb_main_utils.cc

+3, -3

@@ -24,13 +24,13 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#include "pb_main_utils.h"
+
 #include <future>
+#include "pb_utils.h"
 #include "triton/backend/backend_common.h"
 #include "triton/core/tritonserver.h"

-#include "pb_main_utils.h"
-#include "pb_utils.h"
-
 namespace triton { namespace backend { namespace python {

 TRITONSERVER_Error*

‎src/pb_stub.cc

+8, -12

@@ -24,9 +24,8 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#include <pybind11/embed.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
+#include "pb_stub.h"
+
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -47,12 +46,9 @@
 #include "pb_utils.h"
 #include "shm_manager.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
-#include <cuda.h>
+#ifdef TRITON_ENABLE_GPU
 #include <cuda_runtime_api.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
-
-#include "pb_stub.h"
+#endif // TRITON_ENABLE_GPU

 namespace py = pybind11;
 using namespace pybind11::literals;
@@ -298,7 +294,7 @@ Stub::ProcessResponse(
   }

   if (!output_tensor->IsCPU()) {
-#ifdef TRITON_ALLOW_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     std::unordered_map<void*, cudaIpcMemHandle_t*>::const_iterator
         reused_gpu_tensor =
             gpu_tensors_map_.find(output_tensor->GetGPUStartAddress());
@@ -320,7 +316,7 @@ Stub::ProcessRequest(
 {
   std::unique_ptr<InferRequest> infer_request =
       InferRequest::LoadFromSharedMemory(shm_pool_, request_offset);
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   for (auto& input_tensor : infer_request->Inputs()) {
     if (!input_tensor->IsCPU()) {
       response_batch->cleanup = true;
@@ -329,7 +325,7 @@ Stub::ProcessRequest(
           input_tensor->CudaIpcMemHandle()});
     }
   }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

   return infer_request;
 }
@@ -559,7 +555,7 @@ Stub::Cleanup()
   // Deleting the tensors should automatically trigger the destructor.
   tensors_to_remove_.clear();

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   gpu_tensors_map_.clear();
 #endif
 }

‎src/pb_stub.h

+10, -2

@@ -24,15 +24,23 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
 #include <boost/interprocess/sync/interprocess_condition.hpp>
 #include <boost/interprocess/sync/interprocess_mutex.hpp>
 #include <boost/interprocess/sync/scoped_lock.hpp>
 #include <memory>
+#include "infer_request.h"
+#include "infer_response.h"
+#include "pb_tensor.h"
 #include "pb_utils.h"

 #pragma once

 namespace bi = boost::interprocess;
+namespace py = pybind11;
+using namespace pybind11::literals;

 namespace triton { namespace backend { namespace python {

@@ -58,9 +66,9 @@ class Stub {
   bool initialized_;
   static std::unique_ptr<Stub> stub_instance_;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   std::unordered_map<void*, cudaIpcMemHandle_t*> gpu_tensors_map_;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

  public:
   Stub(){};

‎src/pb_tensor.cc

+30, -26

@@ -24,9 +24,9 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 #ifdef TRITON_PB_STUB
 #include "pb_stub_utils.h"
@@ -301,7 +301,7 @@ PbTensor::LoadFromSharedMemory(
         raw_data->memory_type, raw_data->memory_type_id, data,
         raw_data->byte_size, nullptr /* DLManaged Tensor */);
   } else if (raw_data->memory_type == TRITONSERVER_MEMORY_GPU) {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     cudaIpcMemHandle_t* cuda_ipc_mem_handle;
     shm_pool->MapOffset((char**)&cuda_ipc_mem_handle, raw_data->memory_ptr);
     if (!tensor_shm->is_reused) {
@@ -335,7 +335,7 @@ PbTensor::LoadFromSharedMemory(
     }
 #else
     throw PythonBackendException("GPU Tensor is not supported.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }

   return pb_tensor;
@@ -422,7 +422,7 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor)

 PbTensor::~PbTensor() noexcept(false)
 {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   if (!IsCPU() && cuda_ipc_mem_handle_ != nullptr &&
       destruct_cuda_ipc_mem_handle_) {
     cudaError_t err = cudaIpcCloseMemHandle(GetGPUStartAddress());
@@ -434,7 +434,7 @@ PbTensor::~PbTensor() noexcept(false)
              .c_str());
     }
   }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   DeleteDLPack();
 }

@@ -459,24 +459,17 @@ PbTensor::AsNumpy() const
 }
 #endif // TRITON_PB_STUB

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 void*
 PbTensor::GetGPUStartAddress()
 {
   if (!this->IsCPU()) {
+    CUDADriverAPI& driver_api = CUDADriverAPI::getInstance();
     CUdeviceptr start_address;
-    CUresult cuda_err = cuPointerGetAttribute(
+
+    driver_api.PointerGetAttribute(
         &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
         (CUdeviceptr)this->GetDataPtr());
-    if (cuda_err != CUDA_SUCCESS) {
-      const char* error_string;
-      cuGetErrorString(cuda_err, &error_string);
-      throw PythonBackendException(
-          std::string(
-              "failed to get cuda pointer device attribute: " +
-              std::string(error_string))
-              .c_str());
-    }

     return reinterpret_cast<void*>(start_address);
   }
@@ -485,6 +478,19 @@ PbTensor::GetGPUStartAddress()
       "Calling GetGPUStartAddress function on a CPU tensor.");
 }

+uint64_t
+PbTensor::GetGPUPointerOffset()
+{
+  if (!this->IsCPU()) {
+    uint64_t offset = reinterpret_cast<char*>(this->GetDataPtr()) -
+                      reinterpret_cast<char*>(this->GetGPUStartAddress());
+    return offset;
+  }
+
+  throw PythonBackendException(
+      "Calling GetGPUPointerOffset function on a CPU tensor.");
+}
+
 void
 PbTensor::SetReusedIpcHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle)
 {
@@ -498,7 +504,7 @@ PbTensor::CudaIpcMemHandle()
 {
   return cuda_ipc_mem_handle_;
 }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 void
 PbTensor::SaveToSharedMemory(
@@ -536,15 +542,15 @@ PbTensor::SaveToSharedMemory(
       memory_ptr_ = reinterpret_cast<void*>(data_in_shm);
     }
   } else {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     char* cuda_handle;
     uint64_t* ptr_offset;
     SaveTensorToSharedMemory(
         shm_pool, tensor_shm, cuda_handle, this->MemoryType(),
         this->MemoryTypeId(), this->ByteSize(), tensor_name.c_str(),
-        this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset);
-    char* d_ptr = reinterpret_cast<char*>(this->GetDataPtr());
-    *ptr_offset = GetDevicePointerOffset(d_ptr);
+        this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset,
+        shm_offset_);
+    *ptr_offset = this->GetGPUPointerOffset();
     if (!IsReused()) {
       cudaSetDevice(this->MemoryTypeId());
       cudaError_t err = cudaIpcGetMemHandle(
@@ -565,12 +571,10 @@ PbTensor::SaveToSharedMemory(
       *(reinterpret_cast<cudaIpcMemHandle_t*>(cuda_handle)) =
           *CudaIpcMemHandle();
     }
-    void* start_address = this->GetGPUStartAddress();
-    *ptr_offset = reinterpret_cast<char*>(this->GetDataPtr()) -
-                  reinterpret_cast<char*>(start_address);
+    *ptr_offset = this->GetGPUPointerOffset();
 #else
     throw PythonBackendException("GPU tensors are not supported.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }
 }
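
GetGPUPointerOffset is needed because cudaIpcGetMemHandle describes a whole CUDA allocation rather than an arbitrary pointer into it: the handle always refers back to the allocation's base address (what GetGPUStartAddress returns via cuPointerGetAttribute), so the byte offset of the tensor data within that allocation has to travel through shared memory next to the handle. A hedged sketch of how a receiving process would rebuild the data pointer, assuming the usual CUDA IPC flow rather than quoting this repository's code:

    #include <cuda_runtime_api.h>
    #include <cstdint>

    // Map the exporter's allocation and step to the tensor data.
    // Error handling is elided; cudaIpcOpenMemHandle returns cudaError_t.
    void* OpenIpcTensor(const cudaIpcMemHandle_t& handle, uint64_t offset) {
      void* base = nullptr;  // base of the exporter's allocation
      cudaIpcOpenMemHandle(&base, handle, cudaIpcMemLazyEnablePeerAccess);
      return reinterpret_cast<char*>(base) + offset;  // offset = GetGPUPointerOffset()
    }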

‎src/pb_tensor.h

+21, -6

@@ -27,9 +27,9 @@

 #pragma once

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda_runtime_api.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 #include <dlpack/dlpack.h>

@@ -71,9 +71,9 @@ class PbTensor {
   PYTHONBACKEND_TensorType tensor_type_;
   uint64_t byte_size_;
   DLManagedTensor* dl_managed_tensor_;
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   cudaIpcMemHandle_t* cuda_ipc_mem_handle_ = nullptr;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   bool is_reused_ = false;
   uint64_t reused_tensor_offset_ = 0;
   bool destruct_cuda_ipc_mem_handle_ = false;
@@ -142,11 +142,25 @@ class PbTensor {
   const std::string& Name() const;
   static std::shared_ptr<PbTensor> LoadFromSharedMemory(
       std::unique_ptr<SharedMemory>& shm_pool, off_t tensor_offset);
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
+  /// Set the cudaIpcMemHandle for the tensors that are reused.
+  /// \param cuda_ipc_mem_handle reusued tensor cudaIpcMemHandle
   void SetReusedIpcHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle);
+
+  /// Get the GPU start address.
+  /// \return The start address of a device pointer.
+  /// \throws PythonBackendException if the tensor is stored in CPU.
   void* GetGPUStartAddress();
+
+  /// Get the cuda IPC handle corresponding to this tensor.
+  /// \return The cudaIpcMemHandle
   cudaIpcMemHandle_t* CudaIpcMemHandle();
-#endif // TRITON_ENABLE_GPU_TENSORS
+
+  /// Get the GPU pointer offset.
+  /// \return The offset of a device pointer.
+  /// \throws PythonBackendException if the tensor is stored in CPU.
+  uint64_t GetGPUPointerOffset();
+#endif // TRITON_ENABLE_GPU

 #ifdef TRITON_PB_STUB
   /// Get NumPy representation of the tensor.
@@ -201,6 +215,7 @@ class PbTensor {
   /// Get the memory type id.
   /// \return The memory type id of the tensor.
   int64_t MemoryTypeId() const;
+
   PbTensor();

   /// Destructor

‎src/pb_utils.cc

+68, -26

@@ -28,6 +28,7 @@

 #include <archive.h>
 #include <archive_entry.h>
+#include <dlfcn.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <pthread.h>
@@ -44,7 +45,7 @@
 #include <unordered_map>
 #include "shm_manager.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #endif
@@ -83,29 +84,6 @@ SaveStringToSharedMemory(
   strcpy(string_data, str);
 }

-#ifdef TRITON_ENABLE_GPU_TENSORS
-size_t
-GetDevicePointerOffset(void* d_ptr)
-{
-  CUdeviceptr start_address;
-  CUresult cuda_err = cuPointerGetAttribute(
-      &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
-      reinterpret_cast<CUdeviceptr>(d_ptr));
-  if (cuda_err != CUDA_SUCCESS) {
-    const char* error_string;
-    cuGetErrorString(cuda_err, &error_string);
-    throw PythonBackendException(
-        std::string(
-            "failed to get cuda pointer device attribute: " +
-            std::string(error_string))
-            .c_str());
-  }
-
-  return reinterpret_cast<char*>(d_ptr) -
-         reinterpret_cast<char*>(start_address);
-}
-#endif // TRITON_ENABLE_GPU_TENSORS
-
 void
 SaveRawDataToSharedMemory(
     std::unique_ptr<SharedMemory>& shm_pool, off_t& raw_data_offset,
@@ -134,15 +112,15 @@ SaveRawDataToSharedMemory(
   }

   if (memory_type == TRITONSERVER_MEMORY_GPU) {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     off_t buffer_offset;
     shm_pool->Map(
         (char**)&raw_data_ptr, sizeof(cudaIpcMemHandle_t), buffer_offset);
     raw_data->memory_ptr = buffer_offset;
 #else
     throw PythonBackendException(
         "Python backend does not support GPU tensors.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }
 }

@@ -341,4 +319,68 @@ FileExists(std::string& path)
   return stat(path.c_str(), &buffer) == 0;
 }

+#ifdef TRITON_ENABLE_GPU
+
+CUDADriverAPI::CUDADriverAPI()
+{
+  dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY);
+
+  // If libcuda.so is succesfully opened, it must be able to find
+  // "cuPointerGetAttribute" and "cuGetErrorString" symbols.
+  if (dl_open_handle_ != nullptr) {
+    void* cu_pointer_get_attribute_fn =
+        dlsym(dl_open_handle_, "cuPointerGetAttribute");
+    if (cu_pointer_get_attribute_fn == nullptr) {
+      throw PythonBackendException(
+          std::string("Failed to dlsym 'cuPointerGetAttribute'. Error: ") +
+          dlerror());
+    }
+    *((void**)&cu_pointer_get_attribute_fn_) = cu_pointer_get_attribute_fn;
+
+    void* cu_get_error_string_fn = dlsym(dl_open_handle_, "cuGetErrorString");
+    if (cu_get_error_string_fn == nullptr) {
+      throw PythonBackendException(
+          std::string("Failed to dlsym 'cuGetErrorString'. Error: ") +
+          dlerror());
+    }
+    *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn;
+  }
+}
+
+void
+CUDADriverAPI::PointerGetAttribute(
+    CUdeviceptr* start_address, CUpointer_attribute attribute,
+    CUdeviceptr dev_ptr)
+{
+  CUresult cuda_err =
+      (*cu_pointer_get_attribute_fn_)(start_address, attribute, dev_ptr);
+  if (cuda_err != CUDA_SUCCESS) {
+    const char* error_string;
+    (*cu_get_error_string_fn_)(cuda_err, &error_string);
+    throw PythonBackendException(
+        std::string(
+            "failed to get cuda pointer device attribute: " +
+            std::string(error_string))
+            .c_str());
+  }
+}
+
+bool
+CUDADriverAPI::IsAvailable()
+{
+  return dl_open_handle_ != nullptr;
+}
+
+CUDADriverAPI::~CUDADriverAPI() noexcept(false)
+{
+  if (dl_open_handle_ != nullptr) {
+    int status = dlclose(dl_open_handle_);
+    if (status != 0) {
+      throw PythonBackendException("Failed to close the libcuda handle.");
+    }
+  }
+}
+
+#endif
+
 }}} // namespace triton::backend::python
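
One detail worth noting in the constructor above: dlsym returns a void*, and standard C++ has no implicit conversion from an object pointer to a function pointer, which is why the result is written through *((void**)&cu_pointer_get_attribute_fn_). The same idiom is shown below against libm.so.6 purely as a stand-in library (an assumption for illustration, not part of the commit):

    #include <dlfcn.h>

    using CosFn = double (*)(double);

    int main() {
      void* handle = dlopen("libm.so.6", RTLD_LAZY);
      if (handle == nullptr) return 1;

      CosFn cos_fn = nullptr;
      // Same trick as *((void**)&cu_pointer_get_attribute_fn_) = ...
      *reinterpret_cast<void**>(&cos_fn) = dlsym(handle, "cos");

      double value = cos_fn ? cos_fn(0.0) : -1.0;  // expect 1.0
      dlclose(handle);
      return value == 1.0 ? 0 : 1;
    }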

‎src/pb_utils.h

+30, -2

@@ -26,6 +26,9 @@

 #pragma once

+#ifdef TRITON_ENABLE_GPU
+#include <cuda.h>
+#endif // TRITON_ENABLE_GPU
 #include <pthread.h>
 #include <climits>
 #include <exception>
@@ -94,8 +97,6 @@ struct ExecuteArgs {
   off_t response_batch;
 };

-size_t GetDevicePointerOffset(void* d_ptr);
-
 struct InitializeArgs {
   off_t args;
   // Indicates whether the response has an error or not.
@@ -260,4 +261,31 @@ void ExtractTarFile(std::string& archive_path, std::string& dst_path);

 bool FileExists(std::string& path);

+#ifdef TRITON_ENABLE_GPU
+class CUDADriverAPI {
+ public:
+  static CUDADriverAPI& getInstance()
+  {
+    static CUDADriverAPI instance;
+    return instance;
+  }
+
+ private:
+  void* dl_open_handle_ = nullptr;
+  CUresult (*cu_pointer_get_attribute_fn_)(
+      CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr;
+  CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr;
+  CUDADriverAPI();
+  ~CUDADriverAPI() noexcept(false);
+
+ public:
+  CUDADriverAPI(CUDADriverAPI const&) = delete;
+  void operator=(CUDADriverAPI const&) = delete;
+  bool IsAvailable();
+  void PointerGetAttribute(
+      CUdeviceptr* start_address, CUpointer_attribute attr,
+      CUdeviceptr device_ptr);
+};
+#endif // TRITON_ENABLE_GPU
+
 }}} // namespace triton::backend::python

‎src/python.cc

+18, -12

@@ -65,10 +65,6 @@
 #include "triton/core/tritonbackend.h"
 #include "triton/core/tritonserver.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
-#include <cuda.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
-
 #define LOG_IF_EXCEPTION(X) \
   do { \
     try { \
@@ -258,12 +254,12 @@ class ModelInstanceState : public BackendModelInstance {
   std::string path_to_libpython_;
   std::string path_to_activate_;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   std::unordered_map<
       std::array<char, sizeof(cudaIpcMemHandle_t)>, void*,
       boost::hash<std::array<char, sizeof(cudaIpcMemHandle_t)>>>
       gpu_tensors_map_;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
  public:
   static TRITONSERVER_Error* Create(
       ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance,
@@ -846,7 +842,7 @@ ModelInstanceState::ProcessRequests(
   TRITONSERVER_MemoryType actual_memory_type = src_memory_type;
   int64_t actual_memory_type_id = src_memory_type_id;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   if (actual_memory_type == TRITONSERVER_MEMORY_GPU &&
       output_tensor->IsReused()) {
     std::array<char, sizeof(cudaIpcMemHandle_t)> cuda_handle;
@@ -1356,6 +1352,16 @@ ModelInstanceState::GetInputTensor(
   if (input_dtype == TRITONSERVER_TYPE_BYTES) {
     cpu_only_tensors = true;
   }
+
+#ifdef TRITON_ENABLE_GPU
+  CUDADriverAPI& cuda_driver_api = CUDADriverAPI::getInstance();
+  // If CUDA driver API is not available, the input tensors will be moved to
+  // CPU.
+  if (!cuda_driver_api.IsAvailable()) {
+    cpu_only_tensors = true;
+  }
+#endif
+
   TRITONSERVER_MemoryType src_memory_type;
   int64_t src_memory_type_id;
   size_t src_byte_size;
@@ -1364,11 +1370,11 @@ ModelInstanceState::GetInputTensor(
       in, 0 /* input buffer index */, &src_ptr, &src_byte_size,
       &src_memory_type, &src_memory_type_id));

-  // If TRITON_ENABLE_GPU_TENSORS is false, we need to copy the tensors
+  // If TRITON_ENABLE_GPU is false, we need to copy the tensors
   // to the CPU.
-#ifndef TRITON_ENABLE_GPU_TENSORS
+#ifndef TRITON_ENABLE_GPU
   cpu_only_tensors = true;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

   if (cpu_only_tensors || src_memory_type != TRITONSERVER_MEMORY_GPU) {
     input_tensor = std::make_unique<PbTensor>(
@@ -1384,7 +1390,7 @@ ModelInstanceState::GetInputTensor(
         input_name, input_buffer, input_byte_size,
         TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */);
   } else {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     // Retreiving GPU input tensors
     const void* buffer = nullptr;
     std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> alloc_perference;
@@ -1414,7 +1420,7 @@ ModelInstanceState::GetInputTensor(
       return TRITONSERVER_ErrorNew(
           TRITONSERVER_ERROR_INTERNAL,
           "Python backend does not support GPU tensors.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }

   return nullptr;
