Commit 3d800b3

authored Aug 12, 2021
Fix CPU only mode for GPU tensors (triton-inference-server#74)

* Fix CPU only mode for GPU tensors
* Review edits
1 parent 60a6496 commit 3d800b3

9 files changed: +191, -95 lines
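
In short, the commit stops linking the backend and its stub against CUDA::cuda_driver and instead loads libcuda.so at runtime through the new CUDADriverAPI helper (dlopen/dlsym, hence the added -ldl link flags). A build with TRITON_ENABLE_GPU=ON therefore still loads on a machine without a CUDA driver, and GetInputTensor falls back to CPU tensors when the driver is unavailable. A minimal sketch of the underlying dlopen/dlsym pattern, illustrative only and not code from this commit:

    // Probe for the CUDA driver at runtime instead of requiring it at link time.
    // Compile with: g++ probe_cuda.cpp -ldl   (the file name is hypothetical)
    #include <dlfcn.h>
    #include <cstdio>

    int main() {
      void* handle = dlopen("libcuda.so", RTLD_LAZY);
      if (handle == nullptr) {
        // No CUDA driver installed: a backend built this way keeps working
        // and simply treats every tensor as a CPU tensor.
        std::printf("libcuda.so not found; CPU-only mode\n");
        return 0;
      }

      // Resolve a driver entry point by name. The signature below only
      // approximates cuPointerGetAttribute so <cuda.h> is not needed here.
      using PointerGetAttributeFn = int (*)(void*, int, unsigned long long);
      auto fn = reinterpret_cast<PointerGetAttributeFn>(
          dlsym(handle, "cuPointerGetAttribute"));
      std::printf("cuPointerGetAttribute %s\n", fn ? "resolved" : "missing");

      dlclose(handle);
      return 0;
    }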
 

‎CMakeLists.txt

+3, -6

@@ -37,8 +37,7 @@ project(tritonpythonbackend LANGUAGES C CXX)
 # GPU support is disabled by default because python backend doesn't
 # because python backend does not need to access CUDA or GPUs
 #
-option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
-option(TRITON_ENABLE_GPU_TENSORS "Allow GPU input and output tensors" OFF)
+option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
 option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

 set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
@@ -199,6 +198,7 @@ target_link_libraries(
   triton-backend-utils # from repo-backend
   ZLIB::ZLIB
   -larchive # shared memory
+  -ldl # dlopen
 )

 target_link_libraries(
@@ -210,19 +210,16 @@ target_link_libraries(
   triton-backend-utils # from repo-backend
   -larchive # libarchive
   -lrt # shared memory
+  -ldl # dlopen
 )

 if(${TRITON_ENABLE_GPU})
   target_link_libraries(
     triton-python-backend-stub
-    PUBLIC
-      CUDA::cuda_driver
   )

   target_link_libraries(
     triton-python-backend
-    PUBLIC
-      CUDA::cuda_driver
   )
 endif() # TRITON_ENABLE_GPU

‎src/pb_main_utils.cc

+3, -3

@@ -24,13 +24,13 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#include "pb_main_utils.h"
+
 #include <future>
+#include "pb_utils.h"
 #include "triton/backend/backend_common.h"
 #include "triton/core/tritonserver.h"

-#include "pb_main_utils.h"
-#include "pb_utils.h"
-
 namespace triton { namespace backend { namespace python {

 TRITONSERVER_Error*

‎src/pb_stub.cc

+8, -12

@@ -24,9 +24,8 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#include <pybind11/embed.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
+#include "pb_stub.h"
+
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -47,12 +46,9 @@
 #include "pb_utils.h"
 #include "shm_manager.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
-#include <cuda.h>
+#ifdef TRITON_ENABLE_GPU
 #include <cuda_runtime_api.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
-
-#include "pb_stub.h"
+#endif // TRITON_ENABLE_GPU

 namespace py = pybind11;
 using namespace pybind11::literals;
@@ -298,7 +294,7 @@ Stub::ProcessResponse(
   }

   if (!output_tensor->IsCPU()) {
-#ifdef TRITON_ALLOW_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     std::unordered_map<void*, cudaIpcMemHandle_t*>::const_iterator
         reused_gpu_tensor =
             gpu_tensors_map_.find(output_tensor->GetGPUStartAddress());
@@ -320,7 +316,7 @@ Stub::ProcessRequest(
 {
   std::unique_ptr<InferRequest> infer_request =
       InferRequest::LoadFromSharedMemory(shm_pool_, request_offset);
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   for (auto& input_tensor : infer_request->Inputs()) {
     if (!input_tensor->IsCPU()) {
       response_batch->cleanup = true;
@@ -329,7 +325,7 @@ Stub::ProcessRequest(
           input_tensor->CudaIpcMemHandle()});
     }
   }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

   return infer_request;
 }
@@ -559,7 +555,7 @@ Stub::Cleanup()
   // Deleting the tensors should automatically trigger the destructor.
   tensors_to_remove_.clear();

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   gpu_tensors_map_.clear();
 #endif
 }

‎src/pb_stub.h

+10, -2

@@ -24,15 +24,23 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
 #include <boost/interprocess/sync/interprocess_condition.hpp>
 #include <boost/interprocess/sync/interprocess_mutex.hpp>
 #include <boost/interprocess/sync/scoped_lock.hpp>
 #include <memory>
+#include "infer_request.h"
+#include "infer_response.h"
+#include "pb_tensor.h"
 #include "pb_utils.h"

 #pragma once

 namespace bi = boost::interprocess;
+namespace py = pybind11;
+using namespace pybind11::literals;

 namespace triton { namespace backend { namespace python {

@@ -58,9 +66,9 @@ class Stub {
   bool initialized_;
   static std::unique_ptr<Stub> stub_instance_;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   std::unordered_map<void*, cudaIpcMemHandle_t*> gpu_tensors_map_;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

  public:
   Stub(){};

‎src/pb_tensor.cc

+30, -26

@@ -24,9 +24,9 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 #ifdef TRITON_PB_STUB
 #include "pb_stub_utils.h"
@@ -301,7 +301,7 @@ PbTensor::LoadFromSharedMemory(
         raw_data->memory_type, raw_data->memory_type_id, data,
         raw_data->byte_size, nullptr /* DLManaged Tensor */);
   } else if (raw_data->memory_type == TRITONSERVER_MEMORY_GPU) {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     cudaIpcMemHandle_t* cuda_ipc_mem_handle;
     shm_pool->MapOffset((char**)&cuda_ipc_mem_handle, raw_data->memory_ptr);
     if (!tensor_shm->is_reused) {
@@ -335,7 +335,7 @@ PbTensor::LoadFromSharedMemory(
     }
 #else
     throw PythonBackendException("GPU Tensor is not supported.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }

   return pb_tensor;
@@ -422,7 +422,7 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor)

 PbTensor::~PbTensor() noexcept(false)
 {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   if (!IsCPU() && cuda_ipc_mem_handle_ != nullptr &&
       destruct_cuda_ipc_mem_handle_) {
     cudaError_t err = cudaIpcCloseMemHandle(GetGPUStartAddress());
@@ -434,7 +434,7 @@ PbTensor::~PbTensor() noexcept(false)
              .c_str());
     }
   }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   DeleteDLPack();
 }

@@ -459,24 +459,17 @@ PbTensor::AsNumpy() const
 }
 #endif // TRITON_PB_STUB

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 void*
 PbTensor::GetGPUStartAddress()
 {
   if (!this->IsCPU()) {
+    CUDADriverAPI& driver_api = CUDADriverAPI::getInstance();
     CUdeviceptr start_address;
-    CUresult cuda_err = cuPointerGetAttribute(
+
+    driver_api.PointerGetAttribute(
         &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
         (CUdeviceptr)this->GetDataPtr());
-    if (cuda_err != CUDA_SUCCESS) {
-      const char* error_string;
-      cuGetErrorString(cuda_err, &error_string);
-      throw PythonBackendException(
-          std::string(
-              "failed to get cuda pointer device attribute: " +
-              std::string(error_string))
-              .c_str());
-    }

     return reinterpret_cast<void*>(start_address);
   }
@@ -485,6 +478,19 @@ PbTensor::GetGPUStartAddress()
       "Calling GetGPUStartAddress function on a CPU tensor.");
 }

+uint64_t
+PbTensor::GetGPUPointerOffset()
+{
+  if (!this->IsCPU()) {
+    uint64_t offset = reinterpret_cast<char*>(this->GetDataPtr()) -
+                      reinterpret_cast<char*>(this->GetGPUStartAddress());
+    return offset;
+  }
+
+  throw PythonBackendException(
+      "Calling GetGPUPointerOffset function on a CPU tensor.");
+}
+
 void
 PbTensor::SetReusedIpcHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle)
 {
@@ -498,7 +504,7 @@ PbTensor::CudaIpcMemHandle()
 {
   return cuda_ipc_mem_handle_;
 }
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 void
 PbTensor::SaveToSharedMemory(
@@ -536,15 +542,15 @@ PbTensor::SaveToSharedMemory(
       memory_ptr_ = reinterpret_cast<void*>(data_in_shm);
     }
   } else {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     char* cuda_handle;
     uint64_t* ptr_offset;
     SaveTensorToSharedMemory(
         shm_pool, tensor_shm, cuda_handle, this->MemoryType(),
         this->MemoryTypeId(), this->ByteSize(), tensor_name.c_str(),
-        this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset);
-    char* d_ptr = reinterpret_cast<char*>(this->GetDataPtr());
-    *ptr_offset = GetDevicePointerOffset(d_ptr);
+        this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset,
+        shm_offset_);
+    *ptr_offset = this->GetGPUPointerOffset();
     if (!IsReused()) {
       cudaSetDevice(this->MemoryTypeId());
       cudaError_t err = cudaIpcGetMemHandle(
@@ -565,12 +571,10 @@ PbTensor::SaveToSharedMemory(
       *(reinterpret_cast<cudaIpcMemHandle_t*>(cuda_handle)) =
           *CudaIpcMemHandle();
     }
-    void* start_address = this->GetGPUStartAddress();
-    *ptr_offset = reinterpret_cast<char*>(this->GetDataPtr()) -
-                  reinterpret_cast<char*>(start_address);
+    *ptr_offset = this->GetGPUPointerOffset();
 #else
     throw PythonBackendException("GPU tensors are not supported.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }
 }
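
GetGPUPointerOffset is needed because cudaIpcGetMemHandle describes a whole CUDA allocation rather than an arbitrary pointer into it: the handle always refers back to the allocation's base address (what GetGPUStartAddress returns via cuPointerGetAttribute), so the byte offset of the tensor data within that allocation has to travel through shared memory next to the handle. A hedged sketch of how a receiving process would rebuild the data pointer, assuming the usual CUDA IPC flow rather than quoting this repository's code:

    #include <cuda_runtime_api.h>
    #include <cstdint>

    // Map the exporter's allocation and step to the tensor data.
    // Error handling is elided; cudaIpcOpenMemHandle returns cudaError_t.
    void* OpenIpcTensor(const cudaIpcMemHandle_t& handle, uint64_t offset) {
      void* base = nullptr;  // base of the exporter's allocation
      cudaIpcOpenMemHandle(&base, handle, cudaIpcMemLazyEnablePeerAccess);
      return reinterpret_cast<char*>(base) + offset;  // offset = GetGPUPointerOffset()
    }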

‎src/pb_tensor.h

+21, -6

@@ -27,9 +27,9 @@

 #pragma once

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda_runtime_api.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

 #include <dlpack/dlpack.h>

@@ -71,9 +71,9 @@ class PbTensor {
   PYTHONBACKEND_TensorType tensor_type_;
   uint64_t byte_size_;
   DLManagedTensor* dl_managed_tensor_;
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   cudaIpcMemHandle_t* cuda_ipc_mem_handle_ = nullptr;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   bool is_reused_ = false;
   uint64_t reused_tensor_offset_ = 0;
   bool destruct_cuda_ipc_mem_handle_ = false;
@@ -142,11 +142,25 @@ class PbTensor {
   const std::string& Name() const;
   static std::shared_ptr<PbTensor> LoadFromSharedMemory(
       std::unique_ptr<SharedMemory>& shm_pool, off_t tensor_offset);
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
+  /// Set the cudaIpcMemHandle for the tensors that are reused.
+  /// \param cuda_ipc_mem_handle reusued tensor cudaIpcMemHandle
   void SetReusedIpcHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle);
+
+  /// Get the GPU start address.
+  /// \return The start address of a device pointer.
+  /// \throws PythonBackendException if the tensor is stored in CPU.
   void* GetGPUStartAddress();
+
+  /// Get the cuda IPC handle corresponding to this tensor.
+  /// \return The cudaIpcMemHandle
   cudaIpcMemHandle_t* CudaIpcMemHandle();
-#endif // TRITON_ENABLE_GPU_TENSORS
+
+  /// Get the GPU pointer offset.
+  /// \return The offset of a device pointer.
+  /// \throws PythonBackendException if the tensor is stored in CPU.
+  uint64_t GetGPUPointerOffset();
+#endif // TRITON_ENABLE_GPU

 #ifdef TRITON_PB_STUB
   /// Get NumPy representation of the tensor.
@@ -201,6 +215,7 @@ class PbTensor {
   /// Get the memory type id.
   /// \return The memory type id of the tensor.
   int64_t MemoryTypeId() const;
+
   PbTensor();

   /// Destructor

‎src/pb_utils.cc

+68, -26

@@ -28,6 +28,7 @@

 #include <archive.h>
 #include <archive_entry.h>
+#include <dlfcn.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <pthread.h>
@@ -44,7 +45,7 @@
 #include <unordered_map>
 #include "shm_manager.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #endif
@@ -83,29 +84,6 @@ SaveStringToSharedMemory(
   strcpy(string_data, str);
 }

-#ifdef TRITON_ENABLE_GPU_TENSORS
-size_t
-GetDevicePointerOffset(void* d_ptr)
-{
-  CUdeviceptr start_address;
-  CUresult cuda_err = cuPointerGetAttribute(
-      &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
-      reinterpret_cast<CUdeviceptr>(d_ptr));
-  if (cuda_err != CUDA_SUCCESS) {
-    const char* error_string;
-    cuGetErrorString(cuda_err, &error_string);
-    throw PythonBackendException(
-        std::string(
-            "failed to get cuda pointer device attribute: " +
-            std::string(error_string))
-            .c_str());
-  }
-
-  return reinterpret_cast<char*>(d_ptr) -
-         reinterpret_cast<char*>(start_address);
-}
-#endif // TRITON_ENABLE_GPU_TENSORS
-
 void
 SaveRawDataToSharedMemory(
     std::unique_ptr<SharedMemory>& shm_pool, off_t& raw_data_offset,
@@ -134,15 +112,15 @@ SaveRawDataToSharedMemory(
   }

   if (memory_type == TRITONSERVER_MEMORY_GPU) {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     off_t buffer_offset;
     shm_pool->Map(
         (char**)&raw_data_ptr, sizeof(cudaIpcMemHandle_t), buffer_offset);
     raw_data->memory_ptr = buffer_offset;
 #else
     throw PythonBackendException(
         "Python backend does not support GPU tensors.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }
 }

@@ -341,4 +319,68 @@ FileExists(std::string& path)
   return stat(path.c_str(), &buffer) == 0;
 }

+#ifdef TRITON_ENABLE_GPU
+
+CUDADriverAPI::CUDADriverAPI()
+{
+  dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY);
+
+  // If libcuda.so is succesfully opened, it must be able to find
+  // "cuPointerGetAttribute" and "cuGetErrorString" symbols.
+  if (dl_open_handle_ != nullptr) {
+    void* cu_pointer_get_attribute_fn =
+        dlsym(dl_open_handle_, "cuPointerGetAttribute");
+    if (cu_pointer_get_attribute_fn == nullptr) {
+      throw PythonBackendException(
+          std::string("Failed to dlsym 'cuPointerGetAttribute'. Error: ") +
+          dlerror());
+    }
+    *((void**)&cu_pointer_get_attribute_fn_) = cu_pointer_get_attribute_fn;
+
+    void* cu_get_error_string_fn = dlsym(dl_open_handle_, "cuGetErrorString");
+    if (cu_get_error_string_fn == nullptr) {
+      throw PythonBackendException(
+          std::string("Failed to dlsym 'cuGetErrorString'. Error: ") +
+          dlerror());
+    }
+    *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn;
+  }
+}
+
+void
+CUDADriverAPI::PointerGetAttribute(
+    CUdeviceptr* start_address, CUpointer_attribute attribute,
+    CUdeviceptr dev_ptr)
+{
+  CUresult cuda_err =
+      (*cu_pointer_get_attribute_fn_)(start_address, attribute, dev_ptr);
+  if (cuda_err != CUDA_SUCCESS) {
+    const char* error_string;
+    (*cu_get_error_string_fn_)(cuda_err, &error_string);
+    throw PythonBackendException(
+        std::string(
+            "failed to get cuda pointer device attribute: " +
+            std::string(error_string))
+            .c_str());
+  }
+}
+
+bool
+CUDADriverAPI::IsAvailable()
+{
+  return dl_open_handle_ != nullptr;
+}
+
+CUDADriverAPI::~CUDADriverAPI() noexcept(false)
+{
+  if (dl_open_handle_ != nullptr) {
+    int status = dlclose(dl_open_handle_);
+    if (status != 0) {
+      throw PythonBackendException("Failed to close the libcuda handle.");
+    }
+  }
+}
+
+#endif
+
 }}} // namespace triton::backend::python
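
One detail worth noting in the constructor above: dlsym returns a void*, and standard C++ has no implicit conversion from an object pointer to a function pointer, which is why the result is written through *((void**)&cu_pointer_get_attribute_fn_). The same idiom is shown below against libm.so.6 purely as a stand-in library (an assumption for illustration, not part of the commit):

    #include <dlfcn.h>

    using CosFn = double (*)(double);

    int main() {
      void* handle = dlopen("libm.so.6", RTLD_LAZY);
      if (handle == nullptr) return 1;

      CosFn cos_fn = nullptr;
      // Same trick as *((void**)&cu_pointer_get_attribute_fn_) = ...
      *reinterpret_cast<void**>(&cos_fn) = dlsym(handle, "cos");

      double value = cos_fn ? cos_fn(0.0) : -1.0;  // expect 1.0
      dlclose(handle);
      return value == 1.0 ? 0 : 1;
    }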

‎src/pb_utils.h

+30, -2

@@ -26,6 +26,9 @@

 #pragma once

+#ifdef TRITON_ENABLE_GPU
+#include <cuda.h>
+#endif // TRITON_ENABLE_GPU
 #include <pthread.h>
 #include <climits>
 #include <exception>
@@ -94,8 +97,6 @@ struct ExecuteArgs {
   off_t response_batch;
 };

-size_t GetDevicePointerOffset(void* d_ptr);
-
 struct InitializeArgs {
   off_t args;
   // Indicates whether the response has an error or not.
@@ -260,4 +261,31 @@ void ExtractTarFile(std::string& archive_path, std::string& dst_path);

 bool FileExists(std::string& path);

+#ifdef TRITON_ENABLE_GPU
+class CUDADriverAPI {
+ public:
+  static CUDADriverAPI& getInstance()
+  {
+    static CUDADriverAPI instance;
+    return instance;
+  }
+
+ private:
+  void* dl_open_handle_ = nullptr;
+  CUresult (*cu_pointer_get_attribute_fn_)(
+      CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr;
+  CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr;
+  CUDADriverAPI();
+  ~CUDADriverAPI() noexcept(false);
+
+ public:
+  CUDADriverAPI(CUDADriverAPI const&) = delete;
+  void operator=(CUDADriverAPI const&) = delete;
+  bool IsAvailable();
+  void PointerGetAttribute(
+      CUdeviceptr* start_address, CUpointer_attribute attr,
+      CUdeviceptr device_ptr);
+};
+#endif // TRITON_ENABLE_GPU
+
 }}} // namespace triton::backend::python

‎src/python.cc

+18, -12

@@ -65,10 +65,6 @@
 #include "triton/core/tritonbackend.h"
 #include "triton/core/tritonserver.h"

-#ifdef TRITON_ENABLE_GPU_TENSORS
-#include <cuda.h>
-#endif // TRITON_ENABLE_GPU_TENSORS
-
 #define LOG_IF_EXCEPTION(X) \
   do { \
     try { \
@@ -258,12 +254,12 @@ class ModelInstanceState : public BackendModelInstance {
   std::string path_to_libpython_;
   std::string path_to_activate_;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   std::unordered_map<
       std::array<char, sizeof(cudaIpcMemHandle_t)>, void*,
       boost::hash<std::array<char, sizeof(cudaIpcMemHandle_t)>>>
       gpu_tensors_map_;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
  public:
   static TRITONSERVER_Error* Create(
       ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance,
@@ -846,7 +842,7 @@ ModelInstanceState::ProcessRequests(
   TRITONSERVER_MemoryType actual_memory_type = src_memory_type;
   int64_t actual_memory_type_id = src_memory_type_id;

-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
   if (actual_memory_type == TRITONSERVER_MEMORY_GPU &&
       output_tensor->IsReused()) {
     std::array<char, sizeof(cudaIpcMemHandle_t)> cuda_handle;
@@ -1356,6 +1352,16 @@ ModelInstanceState::GetInputTensor(
   if (input_dtype == TRITONSERVER_TYPE_BYTES) {
     cpu_only_tensors = true;
   }
+
+#ifdef TRITON_ENABLE_GPU
+  CUDADriverAPI& cuda_driver_api = CUDADriverAPI::getInstance();
+  // If CUDA driver API is not available, the input tensors will be moved to
+  // CPU.
+  if (!cuda_driver_api.IsAvailable()) {
+    cpu_only_tensors = true;
+  }
+#endif
+
   TRITONSERVER_MemoryType src_memory_type;
   int64_t src_memory_type_id;
   size_t src_byte_size;
@@ -1364,11 +1370,11 @@ ModelInstanceState::GetInputTensor(
       in, 0 /* input buffer index */, &src_ptr, &src_byte_size,
       &src_memory_type, &src_memory_type_id));

-  // If TRITON_ENABLE_GPU_TENSORS is false, we need to copy the tensors
+  // If TRITON_ENABLE_GPU is false, we need to copy the tensors
   // to the CPU.
-#ifndef TRITON_ENABLE_GPU_TENSORS
+#ifndef TRITON_ENABLE_GPU
   cpu_only_tensors = true;
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU

   if (cpu_only_tensors || src_memory_type != TRITONSERVER_MEMORY_GPU) {
     input_tensor = std::make_unique<PbTensor>(
@@ -1384,7 +1390,7 @@ ModelInstanceState::GetInputTensor(
         input_name, input_buffer, input_byte_size,
         TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */);
   } else {
-#ifdef TRITON_ENABLE_GPU_TENSORS
+#ifdef TRITON_ENABLE_GPU
     // Retreiving GPU input tensors
     const void* buffer = nullptr;
     std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> alloc_perference;
@@ -1414,7 +1420,7 @@ ModelInstanceState::GetInputTensor(
       return TRITONSERVER_ErrorNew(
           TRITONSERVER_ERROR_INTERNAL,
           "Python backend does not support GPU tensors.");
-#endif // TRITON_ENABLE_GPU_TENSORS
+#endif // TRITON_ENABLE_GPU
   }

   return nullptr;
