
Commit e08f302

Use context guard to replace make_current

1 parent 585538e commit e08f302

13 files changed: +101 −55 lines
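In short: instead of calling `CUDAContext::get_instance().make_current()` and leaving the CUDA context bound to the thread, call sites now hold a RAII guard from `CUDAContext::get_instance().get_guard()` that pushes the context on construction and pops it on destruction. Below is a minimal, self-contained sketch of the idea; the vector-backed stack and the `push_context`/`pop_context` helpers are illustrative stand-ins for the driver's per-thread context stack (cuCtxPushCurrent/cuCtxPopCurrent), not Taichi code.

// Illustrative stand-ins only; not Taichi's real driver wrappers.
#include <cassert>
#include <vector>

struct Context {};

std::vector<Context *> ctx_stack;  // models the driver's per-thread stack

void push_context(Context *ctx) { ctx_stack.push_back(ctx); }

Context *pop_context() {
  Context *top = ctx_stack.back();
  ctx_stack.pop_back();
  return top;
}

class ScopedContextGuard {
  Context *ctx_;

 public:
  explicit ScopedContextGuard(Context *ctx) : ctx_(ctx) {
    push_context(ctx_);  // context becomes current for this scope
  }
  ~ScopedContextGuard() {
    Context *popped = pop_context();  // popped even on early return
    assert(popped == ctx_);
    (void)popped;
  }
  // Non-copyable so the context is popped exactly once.
  ScopedContextGuard(const ScopedContextGuard &) = delete;
  ScopedContextGuard &operator=(const ScopedContextGuard &) = delete;
};

int main() {
  Context cuda_ctx;
  {
    ScopedContextGuard guard(&cuda_ctx);  // replaces a bare make_current()
    // ... driver calls that need a current context ...
  }  // guard destroyed: the thread's context stack is balanced again
  assert(ctx_stack.empty());
  return 0;
}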

taichi/codegen/cuda/codegen_cuda.cpp (+27 −20)

@@ -786,7 +786,6 @@ FunctionType CUDAModuleToFunctionConverter::convert(

   return [cuda_modules, kernel_name, args, offloaded_tasks,
           executor = this->executor_](RuntimeContext &context) {
-    CUDAContext::get_instance().make_current();
     std::vector<void *> arg_buffers(args.size(), nullptr);
     std::vector<void *> device_buffers(args.size(), nullptr);

@@ -804,24 +803,28 @@ FunctionType CUDAModuleToFunctionConverter::convert(
         // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes
         // `arr_sz` zero.
         unsigned int attr_val = 0;
-        uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call(
-            &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
-            (void *)arg_buffers[i]);
-
-        if (ret_code != CUDA_SUCCESS || attr_val != CU_MEMORYTYPE_DEVICE) {
-          // Copy to device buffer if arg is on host
-          // - ret_code != CUDA_SUCCESS:
-          //   arg_buffers[i] is not on device
-          // - attr_val != CU_MEMORYTYPE_DEVICE:
-          //   Cuda driver is aware of arg_buffers[i] but it might be on
-          //   host.
-          // See CUDA driver API `cuPointerGetAttribute` for more details.
-          transferred = true;
-          CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz);
-          CUDADriver::get_instance().memcpy_host_to_device(
-              (void *)device_buffers[i], arg_buffers[i], arr_sz);
-        } else {
-          device_buffers[i] = arg_buffers[i];
+        {
+          auto context_guard = CUDAContext::get_instance().get_guard();
+          uint32_t ret_code =
+              CUDADriver::get_instance().mem_get_attribute.call(
+                  &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                  (void *)arg_buffers[i]);
+
+          if (ret_code != CUDA_SUCCESS || attr_val != CU_MEMORYTYPE_DEVICE) {
+            // Copy to device buffer if arg is on host
+            // - ret_code != CUDA_SUCCESS:
+            //   arg_buffers[i] is not on device
+            // - attr_val != CU_MEMORYTYPE_DEVICE:
+            //   Cuda driver is aware of arg_buffers[i] but it might be on
+            //   host.
+            // See CUDA driver API `cuPointerGetAttribute` for more details.
+            transferred = true;
+            CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz);
+            CUDADriver::get_instance().memcpy_host_to_device(
+                (void *)device_buffers[i], arg_buffers[i], arr_sz);
+          } else {
+            device_buffers[i] = arg_buffers[i];
+          }
         }
         // device_buffers[i] saves a raw ptr on CUDA device.
         context.set_arg(i, (uint64)device_buffers[i]);
@@ -845,7 +848,10 @@ FunctionType CUDAModuleToFunctionConverter::convert(
       }
     }
     if (transferred) {
-      CUDADriver::get_instance().stream_synchronize(nullptr);
+      {
+        auto context_guard = CUDAContext::get_instance().get_guard();
+        CUDADriver::get_instance().stream_synchronize(nullptr);
+      }
     }

    for (int i = 0; i < offloaded_tasks.size(); i++) {
@@ -859,6 +865,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(

    // copy data back to host
    if (transferred) {
+      auto context_guard = CUDAContext::get_instance().get_guard();
      CUDADriver::get_instance().stream_synchronize(nullptr);
      for (int i = 0; i < (int)args.size(); i++) {
        if (device_buffers[i] != arg_buffers[i]) {
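Note the extra braces around several of the rewritten call sites: because `get_guard()` now returns a `std::unique_ptr<ContextGuard>`, wrapping the guard in its own block bounds exactly how long the context stays pushed. A tiny self-contained sketch of that lifetime (names here are illustrative, not Taichi's):

// Illustrative lifetime demo; `Guard` stands in for ContextGuard.
#include <cstdio>
#include <memory>

struct Guard {
  Guard() { std::puts("context pushed"); }
  ~Guard() { std::puts("context popped"); }
};

std::unique_ptr<Guard> get_guard() { return std::make_unique<Guard>(); }

int main() {
  {
    auto context_guard = get_guard();
    std::puts("driver call runs with the context current");
  }  // unique_ptr goes out of scope here -> "context popped"
  std::puts("subsequent work runs without the context held");
  return 0;
}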

taichi/program/sparse_matrix.cpp (+7 −1)

@@ -1,4 +1,7 @@
 #include "taichi/program/sparse_matrix.h"
+#if defined(TI_WITH_CUDA)
+#include "taichi/rhi/cuda/cuda_context.h"
+#endif

 #include <sstream>
 #include <string>
@@ -204,6 +207,7 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
                                         int nnz) {
 #if defined(TI_WITH_CUDA)
   void *csr_row_offset_ptr = NULL;
+  auto context_guard = CUDAContext::get_instance().get_guard();
   CUDADriver::get_instance().malloc(&csr_row_offset_ptr,
                                     sizeof(int) * (rows_ + 1));
   cusparseHandle_t cusparse_handle;
@@ -269,8 +273,10 @@ void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) {
       &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);

   void *dBuffer = NULL;
-  if (bufferSize > 0)
+  if (bufferSize > 0) {
+    auto context_guard = CUDAContext::get_instance().get_guard();
     CUDADriver::get_instance().malloc(&dBuffer, bufferSize);
+  }
   CUSPARSEDriver::get_instance().cpSpMV(
       cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
       &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);
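One detail worth noting in the `spmv` hunk: the added braces are not cosmetic. An unbraced `if` governs only the next statement, so inserting the guard without braces would have made the guard declaration the entire conditional body and run the `malloc` unconditionally. A minimal illustration of the pitfall (stand-in names, not Taichi code):

// Why braces must be added along with the guard under an `if`.
#include <cassert>

static int malloc_calls = 0;
struct Guard {};                       // stand-in for the context guard
static void driver_malloc() { ++malloc_calls; }

int main() {
  int bufferSize = 0;                  // nothing to allocate
  if (bufferSize > 0) {
    Guard context_guard;               // scoped with the allocation
    driver_malloc();                   // only runs when bufferSize > 0
  }
  // Without the braces, `Guard context_guard;` alone would have been the
  // `if` body and driver_malloc() would have executed unconditionally.
  assert(malloc_calls == 0);
  return 0;
}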

taichi/python/export_lang.cpp (−8)

@@ -1100,14 +1100,6 @@ void export_lang(py::module &m) {
       }
     });

-#if defined(TI_WITH_CUDA)
-  m.def("pop_cuda_context",
-        []() { CUDADriver::get_instance().context_pop_current(NULL); });
-
-  m.def("push_cuda_context",
-        []() { CUDAContext::get_instance().make_current(); });
-#endif
-
  // Type system

  py::class_<Type>(m, "Type").def("to_string", &Type::to_string);

taichi/rhi/cuda/cuda_context.cpp (+6)

@@ -55,20 +55,26 @@ CUDAContext::CUDAContext()

 std::size_t CUDAContext::get_total_memory() {
   std::size_t ret, _;
+  this->make_current();
   driver_.mem_get_info(&_, &ret);
+  driver_.context_pop_current(nullptr);
   return ret;
 }

 std::size_t CUDAContext::get_free_memory() {
   std::size_t ret, _;
+  this->make_current();
   driver_.mem_get_info(&ret, &_);
+  driver_.context_pop_current(nullptr);
   return ret;
 }

 std::string CUDAContext::get_device_name() {
   constexpr uint32_t kMaxNameStringLength = 128;
   char name[kMaxNameStringLength];
+  this->make_current();
   driver_.device_get_name(name, kMaxNameStringLength /*=128*/, device_);
+  driver_.context_pop_current(nullptr);
   std::string str(name);
   return str;
 }

taichi/rhi/cuda/cuda_context.h (+17 −5)

@@ -78,21 +78,33 @@ class CUDAContext {
     void *new_ctx_;

    public:
-    ContextGuard(CUDAContext *new_ctx) : old_ctx_(nullptr), new_ctx_(new_ctx) {
+    ContextGuard(CUDAContext *new_ctx)
+        : old_ctx_(nullptr), new_ctx_(new_ctx->context_) {
       CUDADriver::get_instance().context_get_current(&old_ctx_);
-      if (old_ctx_ != new_ctx)
+      if (old_ctx_ != new_ctx_) {
         new_ctx->make_current();
+      }
     }

     ~ContextGuard() {
-      if (old_ctx_ != new_ctx_) {
+      // Always pop the current context in order to interoperate with
+      // third-party libs. However, this breaks with nested guards; if any
+      // context-related errors are encountered, use this logic instead:
+      // if (old_ctx_ != new_ctx_) {
+      //   CUDADriver::get_instance().context_set_current(old_ctx_);
+      // }
+      void *pop_ctx = nullptr;
+      CUDADriver::get_instance().context_pop_current(&pop_ctx);
+      TI_ASSERT(pop_ctx == new_ctx_);
+
+      if (old_ctx_ != nullptr) {
         CUDADriver::get_instance().context_set_current(old_ctx_);
       }
     }
   };

-  ContextGuard get_guard() {
-    return ContextGuard(this);
+  std::unique_ptr<ContextGuard> get_guard() {
+    return std::make_unique<ContextGuard>(this);
   }

   std::unique_lock<std::mutex> get_lock_guard() {
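The destructor's ordering is the subtle part of this change: it unconditionally pops the context (so nothing Taichi pushed lingers on the thread's stack when third-party code runs next) and only then restores the caller's previous context, at the cost of the nested-guard caveat noted in the comment. A toy model of that sequence, with a plain stack standing in for the driver's per-thread context stack (all names illustrative; `set_current` replacing the top entry is an assumption about cuCtxSetCurrent's behavior):

// Toy model of the ContextGuard push/pop/restore sequence.
#include <cassert>
#include <vector>

using Ctx = int;                      // 0 means "no context"
static std::vector<Ctx> ctx_stack;

static Ctx current() { return ctx_stack.empty() ? 0 : ctx_stack.back(); }
static void push(Ctx c) { ctx_stack.push_back(c); }
static Ctx pop() {
  Ctx c = ctx_stack.back();
  ctx_stack.pop_back();
  return c;
}
static void set_current(Ctx c) {      // assumed to replace the stack top
  if (!ctx_stack.empty()) ctx_stack.pop_back();
  ctx_stack.push_back(c);
}

class Guard {
  Ctx old_, new_;

 public:
  explicit Guard(Ctx c) : old_(current()), new_(c) {
    if (old_ != new_) push(new_);     // mirrors: new_ctx->make_current()
  }
  ~Guard() {
    Ctx popped = pop();               // always pop, for 3rd-party interop
    assert(popped == new_);           // mirrors: TI_ASSERT(pop_ctx == new_ctx_)
    (void)popped;
    if (old_ != 0) set_current(old_); // restore the caller's context
  }
};

int main() {
  push(7);                            // some third-party context is current
  {
    Guard g(42);                      // Taichi's context goes on top
    assert(current() == 42);
  }                                   // 42 popped, then 7 restored
  assert(current() == 7);
  return 0;
}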

taichi/rhi/cuda/cuda_device.cpp (+17 −1)

@@ -14,6 +14,8 @@ CudaDevice::AllocInfo CudaDevice::get_alloc_info(
 DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
   AllocInfo info;

+  auto context_guard = CUDAContext::get_instance().get_guard();
+
   if (params.host_read || params.host_write) {
     CUDADriver::get_instance().malloc_managed(&info.ptr, params.size,
                                               CU_MEM_ATTACH_GLOBAL);
@@ -45,7 +47,10 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
       caching_allocator_ = std::make_unique<CudaCachingAllocator>(this);
     }
     info.ptr = caching_allocator_->allocate(params);
-    CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size);
+    {
+      auto context_guard = CUDAContext::get_instance().get_guard();
+      CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size);
+    }
   } else {
     info.ptr = allocate_llvm_runtime_memory_jit(params);
   }
@@ -63,6 +68,9 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(

 void CudaDevice::dealloc_memory(DeviceAllocation handle) {
   validate_device_alloc(handle);
+
+  auto context_guard = CUDAContext::get_instance().get_guard();
+
   AllocInfo &info = allocations_[handle.alloc_id];
   if (info.ptr == nullptr) {
     TI_ERROR("the DeviceAllocation is already deallocated");
@@ -80,6 +88,8 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
 }

 void *CudaDevice::map(DeviceAllocation alloc) {
+  auto context_guard = CUDAContext::get_instance().get_guard();
+
   AllocInfo &info = allocations_[alloc.alloc_id];
   size_t size = info.size;
   info.mapped = new char[size];
@@ -89,6 +99,8 @@ void *CudaDevice::map(DeviceAllocation alloc) {
 }

 void CudaDevice::unmap(DeviceAllocation alloc) {
+  auto context_guard = CUDAContext::get_instance().get_guard();
+
   AllocInfo &info = allocations_[alloc.alloc_id];
   CUDADriver::get_instance().memcpy_host_to_device(info.ptr, info.mapped,
                                                    info.size);
@@ -97,6 +109,8 @@ void CudaDevice::unmap(DeviceAllocation alloc) {
 }

 void CudaDevice::memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) {
+  auto context_guard = CUDAContext::get_instance().get_guard();
+
   void *dst_ptr =
       static_cast<char *>(allocations_[dst.alloc_id].ptr) + dst.offset;
   void *src_ptr =
@@ -119,6 +133,8 @@ DeviceAllocation CudaDevice::import_memory(void *ptr, size_t size) {
 }

 uint64 CudaDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
+  auto context_guard = CUDAContext::get_instance().get_guard();
+
   CUDADriver::get_instance().stream_synchronize(nullptr);
   uint64 ret;
   CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,

taichi/rhi/interop/vulkan_cuda_interop.cpp (+4 −1)

@@ -85,7 +85,7 @@ CUexternalMemory import_vk_memory_object_from_handle(HANDLE handle,
   if (is_dedicated) {
     desc.flags |= CUDA_EXTERNAL_MEMORY_DEDICATED;
   }
-
+  auto context_guard = CUDAContext::get_instance().get_guard();
   CUDADriver::get_instance().import_external_memory(&ext_mem, &desc);
   return ext_mem;
 }
@@ -104,6 +104,7 @@ CUexternalMemory import_vk_memory_object_from_handle(int fd,
   if (is_dedicated) {
     desc.flags |= CUDA_EXTERNAL_MEMORY_DEDICATED;
   }
+  auto context_guard = CUDAContext::get_instance().get_guard();
   CUDADriver::get_instance().import_external_memory(&ext_mem, &desc);
   return ext_mem;
 }
@@ -120,6 +121,7 @@ void *map_buffer_onto_external_memory(CUexternalMemory ext_mem,
   desc.offset = offset;
   desc.size = size;

+  auto context_guard = CUDAContext::get_instance().get_guard();
   CUDADriver::get_instance().external_memory_get_mapped_buffer(
       (CUdeviceptr *)&ptr, ext_mem, &desc);
   return ptr;
@@ -137,6 +139,7 @@ void *get_cuda_memory_pointer(VkDeviceMemory mem,
 }

 void cuda_memcpy(void *dst, void *src, size_t size) {
+  auto context_guard = CUDAContext::get_instance().get_guard();
   CUDADriver::get_instance().memcpy_device_to_device(dst, src, size);
 }


taichi/runtime/cuda/jit_cuda.cpp (+1 −3)

@@ -13,9 +13,7 @@ JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
                              "module NVPTX");
     writer.write(ptx);
   }
-  // TODO: figure out why using the guard leads to wrong tests results
-  // auto context_guard = CUDAContext::get_instance().get_guard();
-  CUDAContext::get_instance().make_current();
+  auto context_guard = CUDAContext::get_instance().get_guard();
  // Create module for object
  void *cuda_module;
  TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);

taichi/runtime/cuda/jit_cuda.h (+1 −3)

@@ -47,9 +47,7 @@ class JITModuleCUDA : public JITModule {
   }

   void *lookup_function(const std::string &name) override {
-    // TODO: figure out why using the guard leads to wrong tests results
-    // auto context_guard = CUDAContext::get_instance().get_guard();
-    CUDAContext::get_instance().make_current();
+    auto context_guard = CUDAContext::get_instance().get_guard();
     void *func = nullptr;
     auto t = Time::get_time();
     auto err = CUDADriver::get_instance().module_get_function.call_with_warning(

taichi/runtime/llvm/llvm_runtime_executor.cpp (+15 −13)

@@ -176,7 +176,7 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
 void LlvmRuntimeExecutor::synchronize() {
   if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    CUDAContext::get_instance().make_current();
+    auto context_guard = CUDAContext::get_instance().get_guard();
     CUDADriver::get_instance().stream_synchronize(nullptr);
 #else
     TI_ERROR("No CUDA support");
@@ -191,7 +191,7 @@ uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
   uint64 ret;
   if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    CUDAContext::get_instance().make_current();
+    auto context_guard = CUDAContext::get_instance().get_guard();
     CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                      sizeof(uint64));
 #else
@@ -373,6 +373,7 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
                                 result_buffer);
   if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
+    auto context_guard = CUDAContext::get_instance().get_guard();
     CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
 #else
     TI_NOT_IMPLEMENTED
@@ -476,6 +477,7 @@ void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
   auto ptr = get_ndarray_alloc_info_ptr(alloc);
   if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
+    auto cuda_context = CUDAContext::get_instance().get_guard();
     CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
 #else
     TI_NOT_IMPLEMENTED
@@ -515,9 +517,12 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
   TaichiLLVMContext *tlctx = nullptr;
   if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    CUDADriver::get_instance().malloc(
-        (void **)result_buffer_ptr,
-        sizeof(uint64) * taichi_result_buffer_entries);
+    {
+      auto context_guard = CUDAContext::get_instance().get_guard();
+      CUDADriver::get_instance().malloc(
+          (void **)result_buffer_ptr,
+          sizeof(uint64) * taichi_result_buffer_entries);
+    }
     const auto total_mem = runtime_mem_info_->get_total_memory();
     if (config_->device_memory_fraction == 0) {
       TI_ASSERT(config_->device_memory_GB > 0);
@@ -537,9 +542,11 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
     cuda::CudaDevice::AllocInfo preallocated_device_buffer_alloc_info =
         cuda_device()->get_alloc_info(preallocated_device_buffer_alloc_);
     preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;
-
-    CUDADriver::get_instance().memset(preallocated_device_buffer_, 0,
-                                      prealloc_size);
+    {
+      auto context_guard = CUDAContext::get_instance().get_guard();
+      CUDADriver::get_instance().memset(preallocated_device_buffer_, 0,
+                                        prealloc_size);
+    }
     tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
@@ -612,11 +619,6 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
         "LLVMRuntime_set_profiler_stop", llvm_runtime_,
         (void *)&KernelProfilerBase::profiler_stop);
   }
-#if defined(TI_WITH_CUDA)
-  if (config_->arch == Arch::cuda) {
-    CUDADriver::get_instance().context_pop_current(nullptr);
-  }
-#endif
 }

 void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {

taichi/runtime/program_impls/llvm/llvm_program.cpp (+2)

@@ -13,6 +13,7 @@
 #if defined(TI_WITH_CUDA)
 #include "taichi/runtime/cuda/aot_module_builder_impl.h"
 #include "taichi/codegen/cuda/codegen_cuda.h"
+#include "taichi/rhi/cuda/cuda_context.h"
 #endif

 #if defined(TI_WITH_DX12)
@@ -69,6 +70,7 @@ std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
         Arch::dx12, this, std::move(device_module), tree->id());
   } else {
     TI_ASSERT(config->arch == Arch::cuda);
+    auto context_guard = CUDAContext::get_instance().get_guard();
     auto device_module = clone_struct_compiler_initial_context(
         has_multiple_snode_trees, runtime_exec_->llvm_context_device_.get());
     struct_compiler = std::make_unique<StructCompilerLLVM>(
