Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[llvm] [aot] Added CGraph tests for LLVM backend #5305

Merged
merged 17 commits into from
Jul 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmake/TaichiTests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ endif()
file(GLOB_RECURSE TAICHI_TESTS_SOURCE
"tests/cpp/analysis/*.cpp"
"tests/cpp/aot/*.cpp"
"tests/cpp/aot/llvm/*.cpp"
"tests/cpp/aot/vulkan/*.cpp"
"tests/cpp/backends/*.cpp"
"tests/cpp/backends/llvm/*.cpp"
"tests/cpp/codegen/*.cpp"
"tests/cpp/common/*.cpp"
"tests/cpp/ir/*.cpp"
Expand Down
9 changes: 4 additions & 5 deletions taichi/aot/graph_data.cpp
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
#include "taichi/aot/graph_data.h"
#include "taichi/program/ndarray.h"
#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST

namespace taichi {
namespace lang {
namespace aot {

void CompiledGraph::run(
const std::unordered_map<std::string, IValue> &args) const {
RuntimeContext ctx;
for (const auto &dispatch : dispatches) {
memset(&ctx, 0, sizeof(RuntimeContext));
RuntimeContext ctx = ctx_;

TI_ASSERT(dispatch.compiled_kernel);

// Populate args metadata into RuntimeContext
const auto &symbolic_args_ = dispatch.symbolic_args;
for (int i = 0; i < symbolic_args_.size(); ++i) {
Expand All @@ -27,6 +25,7 @@ void CompiledGraph::run(
TI_ERROR_IF(arr->element_shape != symbolic_arg.element_shape,
"Mismatched shape information for argument {}",
symbolic_arg.name);

set_runtime_ctx_ndarray(&ctx, i, arr);
} else if (ival.tag == aot::ArgKind::kScalar) {
ctx.set_arg(i, ival.val);
Expand Down
6 changes: 5 additions & 1 deletion taichi/aot/graph_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#include <unordered_map>
#include "taichi/ir/type.h"
#include "taichi/aot/module_data.h"
#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST

template <typename T, typename G>
T taichi_union_cast_with_different_sizes(G g);
Expand All @@ -12,7 +15,7 @@ namespace taichi {
namespace lang {
class AotModuleBuilder;
class Ndarray;
struct RuntimeContext;

namespace aot {
// Currently only scalar, matrix and ndarray are supported.
enum class ArgKind { kScalar, kMatrix, kNdarray, kUnknown };
Expand Down Expand Up @@ -112,6 +115,7 @@ struct CompiledDispatch {

struct TI_DLL_EXPORT CompiledGraph {
std::vector<CompiledDispatch> dispatches;
RuntimeContext ctx_;

void run(const std::unordered_map<std::string, IValue> &args) const;

Expand Down
8 changes: 8 additions & 0 deletions taichi/program/ndarray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,14 @@ void set_runtime_ctx_ndarray(RuntimeContext *ctx,
int arg_id,
Ndarray *ndarray) {
ctx->set_arg_devalloc(arg_id, ndarray->ndarray_alloc_, ndarray->shape);

uint64_t total_array_size = 1;
for (const auto &dim : ndarray->total_shape()) {
total_array_size *= dim;
}
total_array_size *= data_type_size(ndarray->dtype);

ctx->set_array_runtime_size(arg_id, total_array_size);
}

} // namespace lang
Expand Down
9 changes: 6 additions & 3 deletions taichi/runtime/llvm/llvm_aot_module_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ LlvmOfflineCache::KernelCacheData LlvmAotModule::load_kernel_from_cache(
// Builds an executable aot::Kernel for `name` from the offline cache.
// NOTE: the scraped diff contained both the old and new return statements;
// only the new one (empty KernelCacheData) is kept here.
std::unique_ptr<aot::Kernel> LlvmAotModule::make_new_kernel(
    const std::string &name) {
  // Load the cached kernel data and lower it into a callable function.
  auto fn = convert_module_to_function(name, load_kernel_from_cache(name));
  // Pass an empty cache entry to KernelImpl — presumably the kernel data was
  // already consumed by convert_module_to_function above, so re-loading it
  // (as the old code did) would be wasted work. TODO(review): confirm
  // KernelImpl does not need the real KernelCacheData here.
  return std::make_unique<llvm_aot::KernelImpl>(
      fn, name, LlvmOfflineCache::KernelCacheData());
}

std::unique_ptr<aot::Field> LlvmAotModule::make_new_field(
Expand Down Expand Up @@ -47,7 +47,10 @@ std::unique_ptr<aot::CompiledGraph> LlvmAotModule::get_graph(std::string name) {
dispatches.push_back({dispatch.kernel_name, dispatch.symbolic_args,
get_kernel(dispatch.kernel_name)});
}
aot::CompiledGraph graph{dispatches};

aot::CompiledGraph graph = aot::CompiledGraph({dispatches});
executor_->prepare_runtime_context(&graph.ctx_);

return std::make_unique<aot::CompiledGraph>(std::move(graph));
}

Expand Down
4 changes: 2 additions & 2 deletions taichi/runtime/llvm/llvm_runtime_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ class LlvmRuntimeExecutor {

LLVMRuntime *get_llvm_runtime();

void prepare_runtime_context(RuntimeContext *ctx);

private:
/* ----------------------- */
/* ------ Allocation ----- */
Expand All @@ -85,8 +87,6 @@ class LlvmRuntimeExecutor {
std::vector<std::unique_ptr<SNodeTree>> &snode_trees_,
uint64 *result_buffer);

void prepare_runtime_context(RuntimeContext *ctx);

template <typename T, typename... Args>
T runtime_query(const std::string &key,
uint64 *result_buffer,
Expand Down
4 changes: 2 additions & 2 deletions taichi/runtime/program_impls/llvm/llvm_program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
#include "llvm/IR/Module.h"

#include "taichi/program/program.h"
#include "taichi/runtime/llvm/aot_graph_data.h"
#include "taichi/codegen/codegen.h"
#include "taichi/codegen/llvm/struct_llvm.h"
#include "taichi/runtime/llvm/aot_graph_data.h"
#include "taichi/runtime/llvm/llvm_offline_cache.h"
#include "taichi/codegen/codegen.h"
#include "taichi/runtime/cpu/aot_module_builder_impl.h"

#if defined(TI_WITH_CUDA)
Expand Down
17 changes: 17 additions & 0 deletions tests/cpp/aot/llvm/field_aot_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import argparse

from utils import compile_field_aot

import taichi as ti


def _parse_args() -> argparse.Namespace:
    """Parse the --arch flag selecting the target backend (cpu or cuda)."""
    parser = argparse.ArgumentParser()
    # required=True: fail with a clear usage error instead of falling through
    # to the unsupported-arch branch when the flag is missing.
    parser.add_argument("--arch", type=str, required=True)
    return parser.parse_args()


if __name__ == "__main__":
    # Parse inside the guard so importing this module has no side effects.
    args = _parse_args()
    if args.arch == "cpu":
        compile_field_aot(arch=ti.cpu)
    elif args.arch == "cuda":
        compile_field_aot(arch=ti.cuda)
    else:
        # A raise (unlike `assert False`) still fires under `python -O`
        # and reports which value was rejected.
        raise RuntimeError(f"Unsupported arch: {args.arch}")
17 changes: 17 additions & 0 deletions tests/cpp/aot/llvm/kernel_aot_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import argparse

from utils import compile_kernel_aot

import taichi as ti


def _parse_args() -> argparse.Namespace:
    """Parse the --arch flag selecting the target backend (cpu or cuda)."""
    parser = argparse.ArgumentParser()
    # required=True: fail with a clear usage error instead of falling through
    # to the unsupported-arch branch when the flag is missing.
    parser.add_argument("--arch", type=str, required=True)
    return parser.parse_args()


if __name__ == "__main__":
    # Parse inside the guard so importing this module has no side effects.
    args = _parse_args()
    if args.arch == "cpu":
        compile_kernel_aot(arch=ti.cpu)
    elif args.arch == "cuda":
        compile_kernel_aot(arch=ti.cuda)
    else:
        # A raise (unlike `assert False`) still fires under `python -O`
        # and reports which value was rejected.
        raise RuntimeError(f"Unsupported arch: {args.arch}")
199 changes: 199 additions & 0 deletions tests/cpp/aot/llvm/mpm88_graph_aot_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#include "gtest/gtest.h"
#include "taichi/ir/ir_builder.h"
#include "taichi/ir/statements.h"
#include "taichi/inc/constants.h"
#include "taichi/program/program.h"
#include "tests/cpp/ir/ndarray_kernel.h"
#include "tests/cpp/program/test_program.h"
#include "taichi/aot/graph_data.h"
#include "taichi/program/graph_builder.h"
#include "taichi/runtime/gfx/aot_module_loader_impl.h"
#include "taichi/rhi/device.h"

#include "taichi/program/kernel_profiler.h"
#include "taichi/runtime/program_impls/llvm/llvm_program.h"
#include "taichi/system/memory_pool.h"
#include "taichi/runtime/cpu/aot_module_loader_impl.h"
#include "taichi/runtime/cuda/aot_module_loader_impl.h"
#include "taichi/rhi/cuda/cuda_driver.h"
#include "taichi/platform/cuda/detect_cuda.h"

#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST

using namespace taichi;
using namespace lang;

// Problem size for the MPM88 demo: particle count and (square) grid
// resolution. These must match the sizes the AOT module was compiled for.
constexpr int NR_PARTICLES = 8192;
constexpr int N_GRID = 128;

// End-to-end test: load the AOT-compiled MPM88 compute graphs ("init" and
// "update") from TAICHI_AOT_FOLDER_PATH and run them on the CPU (x64)
// LLVM backend.
TEST(LlvmCGraph, Mpm88Cpu) {
  CompileConfig cfg;
  cfg.arch = Arch::x64;
  cfg.kernel_profiler = false;
  constexpr KernelProfilerBase *kNoProfiler = nullptr;
  LlvmProgramImpl prog{cfg, kNoProfiler};
  auto *compute_device = prog.get_compute_device();
  // Must have handled all the arch fallback logic by this point.
  auto memory_pool = std::make_unique<MemoryPool>(cfg.arch, compute_device);
  prog.initialize_host();
  uint64 *result_buffer{nullptr};
  prog.materialize_runtime(memory_pool.get(), kNoProfiler, &result_buffer);

  /* AOTLoader */
  cpu::AotModuleParams aot_params;
  const auto folder_dir = getenv("TAICHI_AOT_FOLDER_PATH");
  // Streaming a null char* into a stringstream is undefined behavior; fail
  // fast with a clear message when the environment variable is unset.
  ASSERT_NE(folder_dir, nullptr)
      << "TAICHI_AOT_FOLDER_PATH environment variable is not set";
  aot_params.module_path = folder_dir;
  aot_params.executor_ = prog.get_runtime_executor();
  auto mod = cpu::make_aot_module(aot_params);

  // Prepare & Run "init" Graph
  auto g_init = mod->get_graph("init");

  /* Prepare arguments */
  // x: per-particle position, 2-component f32 vectors.
  constexpr int kArrBytes_x = NR_PARTICLES * 2 * sizeof(float);
  auto devalloc_x = prog.allocate_memory_ndarray(kArrBytes_x, result_buffer);
  auto x = taichi::lang::Ndarray(devalloc_x, taichi::lang::PrimitiveType::f32,
                                 {NR_PARTICLES}, {2});

  // v: per-particle velocity, 2-component f32 vectors.
  constexpr int kArrBytes_v = NR_PARTICLES * 2 * sizeof(float);
  auto devalloc_v = prog.allocate_memory_ndarray(kArrBytes_v, result_buffer);
  auto v = taichi::lang::Ndarray(devalloc_v, taichi::lang::PrimitiveType::f32,
                                 {NR_PARTICLES}, {2});

  // J: per-particle scalar (volume ratio).
  constexpr int kArrBytes_J = NR_PARTICLES * sizeof(float);
  auto devalloc_J = prog.allocate_memory_ndarray(kArrBytes_J, result_buffer);
  auto J = taichi::lang::Ndarray(devalloc_J, taichi::lang::PrimitiveType::f32,
                                 {NR_PARTICLES});

  std::unordered_map<std::string, taichi::lang::aot::IValue> args;
  args.insert({"x", taichi::lang::aot::IValue::create(x)});
  args.insert({"v", taichi::lang::aot::IValue::create(v)});
  args.insert({"J", taichi::lang::aot::IValue::create(J)});

  g_init->run(args);
  prog.synchronize();

  // Prepare & Run "update" Graph
  auto g_update = mod->get_graph("update");

  // grid_v: grid velocity field, 2-component f32 vectors on an N_GRID^2 grid.
  constexpr int kArrBytes_grid_v = N_GRID * N_GRID * 2 * sizeof(float);
  auto devalloc_grid_v =
      prog.allocate_memory_ndarray(kArrBytes_grid_v, result_buffer);
  auto grid_v = taichi::lang::Ndarray(
      devalloc_grid_v, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID}, {2});

  // grid_m: grid mass field, scalar f32 on an N_GRID^2 grid.
  constexpr int kArrBytes_grid_m = N_GRID * N_GRID * sizeof(float);
  auto devalloc_grid_m =
      prog.allocate_memory_ndarray(kArrBytes_grid_m, result_buffer);
  auto grid_m = taichi::lang::Ndarray(
      devalloc_grid_m, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID});

  // pos: per-particle render position, 3-component f32 vectors.
  constexpr int kArrBytes_pos = NR_PARTICLES * 3 * sizeof(float);
  auto devalloc_pos =
      prog.allocate_memory_ndarray(kArrBytes_pos, result_buffer);
  auto pos = taichi::lang::Ndarray(
      devalloc_pos, taichi::lang::PrimitiveType::f32, {NR_PARTICLES}, {3});

  // C: per-particle 2x2 affine velocity matrix.
  constexpr int kArrBytes_C = NR_PARTICLES * sizeof(float) * 2 * 2;
  auto devalloc_C = prog.allocate_memory_ndarray(kArrBytes_C, result_buffer);
  auto C = taichi::lang::Ndarray(devalloc_C, taichi::lang::PrimitiveType::f32,
                                 {NR_PARTICLES}, {2, 2});

  args.insert({"C", taichi::lang::aot::IValue::create(C)});
  args.insert({"grid_v", taichi::lang::aot::IValue::create(grid_v)});
  args.insert({"grid_m", taichi::lang::aot::IValue::create(grid_m)});
  args.insert({"pos", taichi::lang::aot::IValue::create(pos)});

  g_update->run(args);
  prog.synchronize();
}

// End-to-end test: load the AOT-compiled MPM88 compute graphs ("init" and
// "update") and run them on the CUDA LLVM backend. Silently passes when no
// CUDA driver is available on the machine.
TEST(LlvmCGraph, Mpm88Cuda) {
  if (is_cuda_api_available()) {
    CompileConfig cfg;
    cfg.arch = Arch::cuda;
    cfg.kernel_profiler = false;
    constexpr KernelProfilerBase *kNoProfiler = nullptr;
    LlvmProgramImpl prog{cfg, kNoProfiler};
    prog.initialize_host();
    uint64 *result_buffer{nullptr};
    prog.materialize_runtime(nullptr, kNoProfiler, &result_buffer);

    /* AOTLoader */
    cuda::AotModuleParams aot_params;
    const auto folder_dir = getenv("TAICHI_AOT_FOLDER_PATH");
    // Streaming a null char* into a stringstream is undefined behavior; fail
    // fast with a clear message when the environment variable is unset.
    ASSERT_NE(folder_dir, nullptr)
        << "TAICHI_AOT_FOLDER_PATH environment variable is not set";
    aot_params.module_path = folder_dir;
    aot_params.executor_ = prog.get_runtime_executor();
    auto mod = cuda::make_aot_module(aot_params);

    // Prepare & Run "init" Graph
    auto g_init = mod->get_graph("init");

    /* Prepare arguments */
    // x: per-particle position, 2-component f32 vectors.
    constexpr int kArrBytes_x = NR_PARTICLES * 2 * sizeof(float);
    auto devalloc_x = prog.allocate_memory_ndarray(kArrBytes_x, result_buffer);
    auto x = taichi::lang::Ndarray(devalloc_x, taichi::lang::PrimitiveType::f32,
                                   {NR_PARTICLES}, {2});

    // v: per-particle velocity, 2-component f32 vectors.
    constexpr int kArrBytes_v = NR_PARTICLES * 2 * sizeof(float);
    auto devalloc_v = prog.allocate_memory_ndarray(kArrBytes_v, result_buffer);
    auto v = taichi::lang::Ndarray(devalloc_v, taichi::lang::PrimitiveType::f32,
                                   {NR_PARTICLES}, {2});

    // J: per-particle scalar (volume ratio).
    constexpr int kArrBytes_J = NR_PARTICLES * sizeof(float);
    auto devalloc_J = prog.allocate_memory_ndarray(kArrBytes_J, result_buffer);
    auto J = taichi::lang::Ndarray(devalloc_J, taichi::lang::PrimitiveType::f32,
                                   {NR_PARTICLES});

    std::unordered_map<std::string, taichi::lang::aot::IValue> args;
    args.insert({"x", taichi::lang::aot::IValue::create(x)});
    args.insert({"v", taichi::lang::aot::IValue::create(v)});
    args.insert({"J", taichi::lang::aot::IValue::create(J)});

    g_init->run(args);
    prog.synchronize();

    // Prepare & Run "update" Graph
    auto g_update = mod->get_graph("update");

    // grid_v: grid velocity field, 2-component f32 vectors on N_GRID^2 cells.
    constexpr int kArrBytes_grid_v = N_GRID * N_GRID * 2 * sizeof(float);
    auto devalloc_grid_v =
        prog.allocate_memory_ndarray(kArrBytes_grid_v, result_buffer);
    auto grid_v =
        taichi::lang::Ndarray(devalloc_grid_v, taichi::lang::PrimitiveType::f32,
                              {N_GRID, N_GRID}, {2});

    // grid_m: grid mass field, scalar f32 on N_GRID^2 cells.
    constexpr int kArrBytes_grid_m = N_GRID * N_GRID * sizeof(float);
    auto devalloc_grid_m =
        prog.allocate_memory_ndarray(kArrBytes_grid_m, result_buffer);
    auto grid_m = taichi::lang::Ndarray(
        devalloc_grid_m, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID});

    // pos: per-particle render position, 3-component f32 vectors.
    constexpr int kArrBytes_pos = NR_PARTICLES * 3 * sizeof(float);
    auto devalloc_pos =
        prog.allocate_memory_ndarray(kArrBytes_pos, result_buffer);
    auto pos = taichi::lang::Ndarray(
        devalloc_pos, taichi::lang::PrimitiveType::f32, {NR_PARTICLES}, {3});

    // C: per-particle 2x2 affine velocity matrix.
    constexpr int kArrBytes_C = NR_PARTICLES * sizeof(float) * 2 * 2;
    auto devalloc_C = prog.allocate_memory_ndarray(kArrBytes_C, result_buffer);
    auto C = taichi::lang::Ndarray(devalloc_C, taichi::lang::PrimitiveType::f32,
                                   {NR_PARTICLES}, {2, 2});

    args.insert({"C", taichi::lang::aot::IValue::create(C)});
    args.insert({"grid_v", taichi::lang::aot::IValue::create(grid_v)});
    args.insert({"grid_m", taichi::lang::aot::IValue::create(grid_m)});
    args.insert({"pos", taichi::lang::aot::IValue::create(pos)});

    g_update->run(args);
    prog.synchronize();
  }
}
File renamed without changes.
Loading