
Commit 3d800b3

Authored Aug 12, 2021

Fix CPU only for GPU tensors (triton-inference-server#74)

* Fix CPU only mode for GPU tensors
* Review edits

1 parent 60a6496 commit 3d800b3

9 files changed: +191 -95 lines
 

‎CMakeLists.txt

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@ project(tritonpythonbackend LANGUAGES C CXX)
3737
# GPU support is disabled by default because python backend doesn't
3838
# because python backend does not need to access CUDA or GPUs
3939
#
40-
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
41-
option(TRITON_ENABLE_GPU_TENSORS "Allow GPU input and output tensors" OFF)
40+
option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
4241
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
4342

4443
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
@@ -199,6 +198,7 @@ target_link_libraries(
199198
triton-backend-utils # from repo-backend
200199
ZLIB::ZLIB
201200
-larchive # shared memory
201+
-ldl # dlopen
202202
)
203203

204204
target_link_libraries(
@@ -210,19 +210,16 @@ target_link_libraries(
210210
triton-backend-utils # from repo-backend
211211
-larchive # libarchive
212212
-lrt # shared memory
213+
-ldl # dlopen
213214
)
214215

215216
if(${TRITON_ENABLE_GPU})
216217
target_link_libraries(
217218
triton-python-backend-stub
218-
PUBLIC
219-
CUDA::cuda_driver
220219
)
221220

222221
target_link_libraries(
223222
triton-python-backend
224-
PUBLIC
225-
CUDA::cuda_driver
226223
)
227224
endif() # TRITON_ENABLE_GPU
228225
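Note: dropping the CUDA::cuda_driver link in favor of -ldl suggests the backend now resolves the CUDA driver API at runtime with dlopen, so a CPU-only host without libcuda.so can still load the backend and its stub. The sketch below illustrates that pattern; the CUDADriverAPI and PointerGetAttribute names follow identifiers visible in the src/pb_tensor.cc hunk further down, but the member layout and error handling here are assumptions, not the commit's actual implementation.

// Sketch only: a lazily loaded CUDA driver wrapper, assuming the commit's
// CUDADriverAPI behaves roughly like this. Not the actual implementation.
#include <dlfcn.h>
#include <cuda.h>

#include <stdexcept>
#include <string>

class CUDADriverAPI {
 public:
  static CUDADriverAPI& getInstance()
  {
    static CUDADriverAPI instance;
    return instance;
  }

  // True when libcuda.so was found and the required symbol was resolved.
  bool IsAvailable() const { return cu_pointer_get_attribute_fn_ != nullptr; }

  void PointerGetAttribute(
      CUdeviceptr* start_address, CUpointer_attribute attribute,
      CUdeviceptr dev_ptr)
  {
    if (!IsAvailable()) {
      throw std::runtime_error("CUDA driver API is not available.");
    }
    CUresult err =
        cu_pointer_get_attribute_fn_(start_address, attribute, dev_ptr);
    if (err != CUDA_SUCCESS) {
      const char* error_string = nullptr;
      if (cu_get_error_string_fn_ != nullptr) {
        cu_get_error_string_fn_(err, &error_string);
      }
      throw std::runtime_error(
          std::string("failed to get CUDA pointer attribute: ") +
          (error_string != nullptr ? error_string : "unknown error"));
    }
  }

 private:
  CUDADriverAPI()
  {
    // Resolve libcuda at runtime; on a CPU-only host this simply fails and
    // the GPU-tensor code paths report an error when used, instead of the
    // shared library failing to load because of a missing link-time
    // dependency on libcuda.
    dl_handle_ = dlopen("libcuda.so", RTLD_LAZY);
    if (dl_handle_ != nullptr) {
      cu_pointer_get_attribute_fn_ =
          reinterpret_cast<decltype(cu_pointer_get_attribute_fn_)>(
              dlsym(dl_handle_, "cuPointerGetAttribute"));
      cu_get_error_string_fn_ =
          reinterpret_cast<decltype(cu_get_error_string_fn_)>(
              dlsym(dl_handle_, "cuGetErrorString"));
    }
  }

  void* dl_handle_ = nullptr;
  // Function-pointer types mirror the driver API signatures in cuda.h.
  CUresult (*cu_pointer_get_attribute_fn_)(
      void* data, CUpointer_attribute attribute, CUdeviceptr ptr) = nullptr;
  CUresult (*cu_get_error_string_fn_)(
      CUresult error, const char** str) = nullptr;
};

With this arrangement, GetGPUStartAddress() can fail at the point a GPU tensor is actually used, rather than the whole backend being unusable on a machine without a CUDA driver.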

‎src/pb_main_utils.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@
2424
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626

27+
#include "pb_main_utils.h"
28+
2729
#include <future>
30+
#include "pb_utils.h"
2831
#include "triton/backend/backend_common.h"
2932
#include "triton/core/tritonserver.h"
3033

31-
#include "pb_main_utils.h"
32-
#include "pb_utils.h"
33-
3434
namespace triton { namespace backend { namespace python {
3535

3636
TRITONSERVER_Error*

‎src/pb_stub.cc

Lines changed: 8 additions & 12 deletions

@@ -24,9 +24,8 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#include <pybind11/embed.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
+#include "pb_stub.h"
+
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -47,12 +46,9 @@
 #include "pb_utils.h"
 #include "shm_manager.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
-#include <cuda.h>
+#ifdef TRITON_ENABLE_GPU
 #include <cuda_runtime_api.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
-
-#include "pb_stub.h"
+#endif // TRITON_ENABLE_GPU

 namespace py = pybind11;
 using namespace pybind11::literals;
@@ -298,7 +294,7 @@ Stub::ProcessResponse(
   }

   if (!output_tensor->IsCPU()) {
-#ifdef TRITON_ALLOW_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     std::unordered_map<void*, cudaIpcMemHandle_t*>::const_iterator
         reused_gpu_tensor =
            gpu_tensors_map_.find(output_tensor->GetGPUStartAddress());
@@ -320,7 +316,7 @@ Stub::ProcessRequest(
 {
   std::unique_ptr<InferRequest> infer_request =
       InferRequest::LoadFromSharedMemory(shm_pool_, request_offset);
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   for (auto& input_tensor : infer_request->Inputs()) {
     if (!input_tensor->IsCPU()) {
       response_batch->cleanup = true;
@@ -329,7 +325,7 @@
             input_tensor->CudaIpcMemHandle()});
     }
   }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

   return infer_request;
 }
@@ -559,7 +555,7 @@ Stub::Cleanup()
   // Deleting the tensors should automatically trigger the destructor.
   tensors_to_remove_.clear();

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   gpu_tensors_map_.clear();
 #endif
 }

‎src/pb_stub.h

Lines changed: 10 additions & 2 deletions

@@ -24,15 +24,23 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
 #include <boost/interprocess/sync/interprocess_condition.hpp>
 #include <boost/interprocess/sync/interprocess_mutex.hpp>
 #include <boost/interprocess/sync/scoped_lock.hpp>
 #include <memory>
+#include "infer_request.h"
+#include "infer_response.h"
+#include "pb_tensor.h"
 #include "pb_utils.h"

 #pragma once

 namespace bi = boost::interprocess;
+namespace py = pybind11;
+using namespace pybind11::literals;

 namespace triton { namespace backend { namespace python {

@@ -58,9 +66,9 @@ class Stub {
   bool initialized_;
   static std::unique_ptr<Stub> stub_instance_;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   std::unordered_map<void*, cudaIpcMemHandle_t*> gpu_tensors_map_;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

  public:
   Stub(){};

‎src/pb_tensor.cc

Lines changed: 30 additions & 26 deletions

@@ -24,9 +24,9 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 #ifdef TRITON_PB_STUB
 #include "pb_stub_utils.h"
@@ -301,7 +301,7 @@ PbTensor::LoadFromSharedMemory(
         raw_data->memory_type, raw_data->memory_type_id, data,
         raw_data->byte_size, nullptr /* DLManaged Tensor */);
   } else if (raw_data->memory_type == TRITONSERVER_MEMORY_GPU) {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     cudaIpcMemHandle_t* cuda_ipc_mem_handle;
     shm_pool->MapOffset((char**)&cuda_ipc_mem_handle, raw_data->memory_ptr);
     if (!tensor_shm->is_reused) {
@@ -335,7 +335,7 @@ PbTensor::LoadFromSharedMemory(
     }
 #else
     throw PythonBackendException("GPU Tensor is not supported.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }

   return pb_tensor;
@@ -422,7 +422,7 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor)

 PbTensor::~PbTensor() noexcept(false)
 {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   if (!IsCPU() && cuda_ipc_mem_handle_ != nullptr &&
       destruct_cuda_ipc_mem_handle_) {
     cudaError_t err = cudaIpcCloseMemHandle(GetGPUStartAddress());
@@ -434,7 +434,7 @@ PbTensor::~PbTensor() noexcept(false)
              .c_str());
     }
   }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   DeleteDLPack();
 }

@@ -459,24 +459,17 @@ PbTensor::AsNumpy() const
 }
 #endif // TRITON_PB_STUB

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 void*
 PbTensor::GetGPUStartAddress()
 {
   if (!this->IsCPU()) {
+    CUDADriverAPI& driver_api = CUDADriverAPI::getInstance();
     CUdeviceptr start_address;
-    CUresult cuda_err = cuPointerGetAttribute(
+
+    driver_api.PointerGetAttribute(
         &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
         (CUdeviceptr)this->GetDataPtr());
-    if (cuda_err != CUDA_SUCCESS) {
-      const char* error_string;
-      cuGetErrorString(cuda_err, &error_string);
-      throw PythonBackendException(
-          std::string(
-              "failed to get cuda pointer device attribute: " +
-              std::string(error_string))
-              .c_str());
-    }

     return reinterpret_cast<void*>(start_address);
   }
@@ -485,6 +478,19 @@ PbTensor::GetGPUStartAddress()
       "Calling GetGPUStartAddress function on a CPU tensor.");
 }

+uint64_t
+PbTensor::GetGPUPointerOffset()
+{
+  if (!this->IsCPU()) {
+    uint64_t offset = reinterpret_cast<char*>(this->GetDataPtr()) -
+                      reinterpret_cast<char*>(this->GetGPUStartAddress());
+    return offset;
+  }
+
+  throw PythonBackendException(
+      "Calling GetGPUPointerOffset function on a CPU tensor.");
+}
+
 void
 PbTensor::SetReusedIpcHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle)
 {
@@ -498,7 +504,7 @@ PbTensor::CudaIpcMemHandle()
 {
   return cuda_ipc_mem_handle_;
 }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 void
 PbTensor::SaveToSharedMemory(
@@ -536,15 +542,15 @@ PbTensor::SaveToSharedMemory(
       memory_ptr_ = reinterpret_cast<void*>(data_in_shm);
     }
   } else {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     char* cuda_handle;
     uint64_t* ptr_offset;
     SaveTensorToSharedMemory(
         shm_pool, tensor_shm, cuda_handle, this->MemoryType(),
         this->MemoryTypeId(), this->ByteSize(), tensor_name.c_str(),
-        this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset);
-    char* d_ptr = reinterpret_cast<char*>(this->GetDataPtr());
-    *ptr_offset = GetDevicePointerOffset(d_ptr);
+        this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset,
+        shm_offset_);
+    *ptr_offset = this->GetGPUPointerOffset();
     if (!IsReused()) {
       cudaSetDevice(this->MemoryTypeId());
       cudaError_t err = cudaIpcGetMemHandle(
@@ -565,12 +571,10 @@ PbTensor::SaveToSharedMemory(
       *(reinterpret_cast<cudaIpcMemHandle_t*>(cuda_handle)) =
           *CudaIpcMemHandle();
     }
-    void* start_address = this->GetGPUStartAddress();
-    *ptr_offset = reinterpret_cast<char*>(this->GetDataPtr()) -
-                  reinterpret_cast<char*>(start_address);
+    *ptr_offset = this->GetGPUPointerOffset();
 #else
     throw PythonBackendException("GPU tensors are not supported.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }
 }
576580

‎src/pb_tensor.h

Lines changed: 21 additions & 6 deletions

@@ -27,9 +27,9 @@

 #pragma once

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda_runtime_api.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 #include <dlpack/dlpack.h>

@@ -71,9 +71,9 @@ class PbTensor {
   PYTHONBACKEND_TensorType tensor_type_;
   uint64_t byte_size_;
   DLManagedTensor* dl_managed_tensor_;
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   cudaIpcMemHandle_t* cuda_ipc_mem_handle_ = nullptr;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   bool is_reused_ = false;
   uint64_t reused_tensor_offset_ = 0;
   bool destruct_cuda_ipc_mem_handle_ = false;
@@ -142,11 +142,25 @@ class PbTensor {
   const std::string& Name() const;
   static std::shared_ptr<PbTensor> LoadFromSharedMemory(
       std::unique_ptr<SharedMemory>& shm_pool, off_t tensor_offset);
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
+  /// Set the cudaIpcMemHandle for the tensors that are reused.
+  /// \param cuda_ipc_mem_handle reusued tensor cudaIpcMemHandle
   void SetReusedIpcHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle);
+
+  /// Get the GPU start address.
+  /// \return The start address of a device pointer.
+  /// \throws PythonBackendException if the tensor is stored in CPU.
   void* GetGPUStartAddress();
+
+  /// Get the cuda IPC handle corresponding to this tensor.
+  /// \return The cudaIpcMemHandle
   cudaIpcMemHandle_t* CudaIpcMemHandle();
-#endif // TRITON_ENABLE_GPU_TENSORS
+
+  /// Get the GPU pointer offset.
+  /// \return The offset of a device pointer.
+  /// \throws PythonBackendException if the tensor is stored in CPU.
+  uint64_t GetGPUPointerOffset();
+#endif // TRITON_ENABLE_GPU

 #ifdef TRITON_PB_STUB
   /// Get NumPy representation of the tensor.
@@ -201,6 +215,7 @@ class PbTensor {
   /// Get the memory type id.
   /// \return The memory type id of the tensor.
   int64_t MemoryTypeId() const;
+
   PbTensor();

   /// Destructor
