Skip to content

Commit ac8c7c4

Browse files
beauby and facebook-github-bot
authored and committed Sep 21, 2020
Make Channel API accept buffer structs rather than raw pointers. (pytorch#45014)
Summary: Pull Request resolved: pytorch#45014 Pull Request resolved: pytorch/tensorpipe#219 Pull Request resolved: pytorch/tensorpipe#212 + Introduce buffer.h defining the buffer struct(s). The `CpuBuffer` struct is always defined, while the `CudaBuffer` struct is defined only when `TENSORPIPE_SUPPORTS_CUDA` is true. + Update all channels to take a `CpuBuffer` or `CudaBuffer` for `send`/`recv` rather than a raw pointer and a length. + Make the base `Channel`/`Context` classes templated on `TBuffer`, effectively creating two channel hierarchies (one for CPU channels, one for CUDA channels). + Update the Pipe and the generic channel tests to use the new API. So far, generic channel tests are CPU only, and tests for the CUDA IPC channel are (temporarily) disabled. A subsequent PR will take care of refactoring tests so that generic tests work for CUDA channels. Another PR will add support for CUDA tensors in the Pipe. Differential Revision: D23598033 Test Plan: Imported from OSS Reviewed By: lw Pulled By: beauby fbshipit-source-id: 1d6c3f91e288420858835cd5e7962e8da051b44b
1 parent 4bbb6ad commit ac8c7c4

File tree

6 files changed

+105
-64
lines changed

6 files changed

+105
-64
lines changed
 

‎BUILD.bazel

+6-3
Original file line numberDiff line numberDiff line change
@@ -1727,13 +1727,16 @@ cc_library(
17271727
"@gloo",
17281728
"@onnx",
17291729
"@fmt",
1730-
"@tensorpipe",
17311730
] + if_cuda(
17321731
[
17331732
":caffe2_cpp_cuda",
17341733
":aten_cuda",
1734+
"@tensorpipe//:tensorpipe_cuda",
1735+
],
1736+
[
1737+
":aten",
1738+
"@tensorpipe",
17351739
],
1736-
[":aten"],
17371740
),
17381741
alwayslink = True,
17391742
)
@@ -1764,7 +1767,7 @@ cu_library(
17641767
"@cudnn",
17651768
"@eigen",
17661769
"@gloo",
1767-
"@tensorpipe",
1770+
"@tensorpipe//:tensorpipe_cuda",
17681771
],
17691772
alwayslink = True,
17701773
)

‎test/cpp/rpc/test_tensorpipe_serialization.cpp

+19-11
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ TEST(TensorpipeSerialize, Base) {
4242
recvingTpMessage.tensors.reserve(sendingTpMessage.tensors.size());
4343
for (auto& tpTensor : sendingTpMessage.tensors) {
4444
tensorpipe::Message::Tensor t;
45-
t.length = tpTensor.length;
45+
t.buffer = tensorpipe::CpuBuffer{nullptr, tpTensor.buffer.cpu.length};
4646
t.metadata = tpTensor.metadata;
4747
recvingTpMessage.tensors.push_back(std::move(t));
4848
}
@@ -67,7 +67,10 @@ TEST(TensorpipeSerialize, Base) {
6767
for (int i = 0; i < recvingTpMessage.tensors.size(); i++) {
6868
tensorpipe::Message::Tensor& srcTensor = sendingTpMessage.tensors[i];
6969
tensorpipe::Message::Tensor& dstTensor = recvingTpMessage.tensors[i];
70-
memcpy(dstTensor.data, srcTensor.data, srcTensor.length);
70+
memcpy(
71+
dstTensor.buffer.cpu.ptr,
72+
srcTensor.buffer.cpu.ptr,
73+
srcTensor.buffer.cpu.length);
7174
}
7275

7376
// Mimic read() callback:
@@ -110,9 +113,10 @@ TEST(TensorpipeSerialize, RecopySparseTensors) {
110113
EXPECT_TRUE(torch::equal(main, tpBuffers.tensors[0]));
111114
EXPECT_TRUE(torch::equal(tiny, tpBuffers.tensors[1]));
112115
// Test cloned storage
113-
EXPECT_EQ(main.storage().data(), sendingTpMessage.tensors[0].data);
114-
EXPECT_NE(tiny.storage().data(), sendingTpMessage.tensors[1].data);
115-
EXPECT_EQ(tiny.element_size() * k1K, sendingTpMessage.tensors[1].length);
116+
EXPECT_EQ(main.storage().data(), sendingTpMessage.tensors[0].buffer.cpu.ptr);
117+
EXPECT_NE(tiny.storage().data(), sendingTpMessage.tensors[1].buffer.cpu.ptr);
118+
EXPECT_EQ(
119+
tiny.element_size() * k1K, sendingTpMessage.tensors[1].buffer.cpu.length);
116120
}
117121

118122
TEST(TensorpipeSerialize, NoDeleterTensors) {
@@ -136,21 +140,25 @@ TEST(TensorpipeSerialize, NoDeleterTensors) {
136140
EXPECT_EQ(tpBuffers.copiedTensors.size(), 2);
137141
EXPECT_EQ(sendingTpMessage.tensors.size(), 2);
138142
EXPECT_EQ(
139-
tpBuffers.copiedTensors[0].size(), sendingTpMessage.tensors[0].length);
143+
tpBuffers.copiedTensors[0].size(),
144+
sendingTpMessage.tensors[0].buffer.cpu.length);
140145
EXPECT_EQ(
141-
tpBuffers.copiedTensors[1].size(), sendingTpMessage.tensors[1].length);
146+
tpBuffers.copiedTensors[1].size(),
147+
sendingTpMessage.tensors[1].buffer.cpu.length);
142148
EXPECT_EQ(
143-
tpBuffers.copiedTensors[0].data(), sendingTpMessage.tensors[0].data);
149+
tpBuffers.copiedTensors[0].data(),
150+
sendingTpMessage.tensors[0].buffer.cpu.ptr);
144151
EXPECT_EQ(
145-
tpBuffers.copiedTensors[1].data(), sendingTpMessage.tensors[1].data);
152+
tpBuffers.copiedTensors[1].data(),
153+
sendingTpMessage.tensors[1].buffer.cpu.ptr);
146154
EXPECT_TRUE(
147155
memcmp(
148156
tpBuffers.copiedTensors[0].data(),
149157
t1.storage().data(),
150-
sendingTpMessage.tensors[0].length) == 0);
158+
sendingTpMessage.tensors[0].buffer.cpu.length) == 0);
151159
EXPECT_TRUE(
152160
memcmp(
153161
tpBuffers.copiedTensors[1].data(),
154162
t2.storage().data(),
155-
sendingTpMessage.tensors[1].length) == 0);
163+
sendingTpMessage.tensors[1].buffer.cpu.length) == 0);
156164
}

‎third_party/tensorpipe

Submodule tensorpipe updated 110 files

‎third_party/tensorpipe.BUILD

+69-44
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
load("@rules_cc//cc:defs.bzl", "cc_library")
2-
load("@//third_party:substitution.bzl", "template_rule")
2+
load("@//third_party:substitution.bzl", "header_template_rule")
33

44
LIBUV_COMMON_SRCS = [
55
"third_party/libuv/src/fs-poll.c",
@@ -59,62 +59,87 @@ cc_library(
5959
visibility = ["//visibility:public"],
6060
)
6161

62-
proto_library(
63-
name = "tensorpipe_proto_source",
64-
srcs = glob([
65-
"tensorpipe/proto/*.proto",
66-
"tensorpipe/proto/*/*.proto",
67-
]),
68-
visibility = ["//visibility:public"],
69-
)
70-
71-
cc_proto_library(
72-
name = "tensorpipe_protos",
73-
deps = [":tensorpipe_proto_source"],
62+
cc_library(
63+
name = "libnop",
64+
srcs = [],
65+
includes = ["third_party/libnop/include"],
66+
hdrs = glob(["third_party/libnop/include/**/*.h"]),
7467
)
7568

76-
template_rule(
77-
name = "tensorpipe_header_template",
78-
src = "tensorpipe/tensorpipe.h.in",
79-
out = "tensorpipe/tensorpipe.h",
69+
header_template_rule(
70+
name = "tensorpipe_config_header",
71+
src = "tensorpipe/config.h.in",
72+
out = "tensorpipe/config.h",
8073
substitutions = {
81-
"cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT": "define TENSORPIPE_HAS_SHM_TRANSPORT 0",
82-
"cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL": "define TENSORPIPE_HAS_CMA_CHANNEL 0",
74+
"#cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT": "",
75+
"#cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL": "",
76+
"#cmakedefine01 TENSORPIPE_HAS_CUDA_IPC_CHANNEL": "",
77+
"#cmakedefine01 TENSORPIPE_SUPPORTS_CUDA": "",
8378
},
8479
)
8580

81+
TENSORPIPE_HEADERS = glob([
82+
"tensorpipe/*.h",
83+
"tensorpipe/channel/*.h",
84+
"tensorpipe/channel/*/*.h",
85+
"tensorpipe/common/*.h",
86+
"tensorpipe/core/*.h",
87+
"tensorpipe/transport/*.h",
88+
"tensorpipe/transport/*/*.h",
89+
"tensorpipe/util/*/*.h",
90+
])
91+
92+
TENSORPIPE_BASE_SRCS = glob([
93+
"tensorpipe/*.cc",
94+
"tensorpipe/channel/*.cc",
95+
"tensorpipe/common/*.cc",
96+
"tensorpipe/core/*.cc",
97+
"tensorpipe/transport/*.cc",
98+
"tensorpipe/util/*/*.cc",
99+
])
100+
101+
TENSORPIPE_SRCS = TENSORPIPE_BASE_SRCS + glob([
102+
"tensorpipe/channel/basic/*.cc",
103+
"tensorpipe/channel/mpt/*.cc",
104+
"tensorpipe/channel/xth/*.cc",
105+
"tensorpipe/transport/uv/*.cc",
106+
])
107+
108+
TENSORPIPE_SRCS_CUDA = TENSORPIPE_SRCS + glob([
109+
"tensorpipe/channel/cuda_ipc/*.cc",
110+
])
111+
86112
cc_library(
87113
name = "tensorpipe",
88-
srcs = glob(
89-
[
90-
"tensorpipe/*.cc",
91-
"tensorpipe/channel/*.cc",
92-
"tensorpipe/channel/*/*.cc",
93-
"tensorpipe/common/*.cc",
94-
"tensorpipe/core/*.cc",
95-
"tensorpipe/transport/*.cc",
96-
"tensorpipe/transport/*/*.cc",
97-
"tensorpipe/util/*/*.cc",
98-
],
99-
),
100-
hdrs = glob(
101-
[
102-
"tensorpipe/*.h",
103-
"tensorpipe/channel/*.h",
104-
"tensorpipe/channel/*/*.h",
105-
"tensorpipe/common/*.h",
106-
"tensorpipe/core/*.h",
107-
"tensorpipe/transport/*.h",
108-
"tensorpipe/transport/*/*.h",
109-
"tensorpipe/util/*/*.h",
110-
],
111-
),
114+
srcs = TENSORPIPE_SRCS + [":tensorpipe_config_header"],
115+
hdrs = TENSORPIPE_HEADERS,
112116
includes = [
113117
".",
114118
],
115119
copts = [
116120
"-std=c++14",
117121
],
118122
visibility = ["//visibility:public"],
119-
deps = [":tensorpipe_protos", ":libuv"],
123+
deps = [
124+
":libnop",
125+
":libuv",
126+
],
127+
)
128+
129+
cc_library(
130+
name = "tensorpipe_cuda",
131+
srcs = TENSORPIPE_SRCS_CUDA + [":tensorpipe_config_header"],
132+
hdrs = TENSORPIPE_HEADERS,
133+
includes = [
134+
".",
135+
],
136+
copts = [
137+
"-std=c++14",
138+
],
139+
visibility = ["//visibility:public"],
140+
deps = [
141+
":libnop",
142+
":libuv",
143+
"@cuda",
144+
],
120145
)

‎torch/csrc/distributed/rpc/tensorpipe_agent.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
namespace tensorpipe {
1818

19+
class CpuBuffer;
1920
class Context;
2021
class Error;
2122
class Listener;
@@ -30,7 +31,9 @@ class Context;
3031
} // namespace transport
3132

3233
namespace channel {
34+
template <typename TBuffer>
3335
class Context;
36+
using CpuContext = Context<CpuBuffer>;
3437
} // namespace channel
3538

3639
using DeviceMap = std::unordered_map<c10::DeviceIndex, c10::DeviceIndex>;
@@ -53,7 +56,7 @@ struct TransportRegistration {
5356
C10_DECLARE_REGISTRY(TensorPipeTransportRegistry, TransportRegistration);
5457

5558
struct ChannelRegistration {
56-
std::shared_ptr<tensorpipe::channel::Context> channel;
59+
std::shared_ptr<tensorpipe::channel::CpuContext> channel;
5760
int64_t priority;
5861
};
5962

‎torch/csrc/distributed/rpc/tensorpipe_utils.cpp

+6-4
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,17 @@ std::tuple<tensorpipe::Message, TensorpipeWriteBuffers> tensorpipeSerialize(
9999
std::vector<char> storageData(
100100
tensorData.data(), tensorData.data() + tensorData.sizeInBytes());
101101
tpMessage.tensors.push_back(tensorpipe::Message::Tensor{
102-
storageData.data(), storageData.size(), std::move(metadata)});
102+
tensorpipe::CpuBuffer{storageData.data(), storageData.size()},
103+
std::move(metadata)});
103104
buffers.copiedTensors.push_back(std::move(storageData));
104105
} else {
105106
// TensorPipe uses the same Message class for both reading and writing, so
106107
// it uses non-const ptrs even though it doesn't modify them when writing.
107108
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
108109
char* tensorPtr = const_cast<char*>(tensorData.data());
109110
tpMessage.tensors.push_back(tensorpipe::Message::Tensor{
110-
tensorPtr, tensorData.sizeInBytes(), std::move(metadata)});
111+
tensorpipe::CpuBuffer{tensorPtr, tensorData.sizeInBytes()},
112+
std::move(metadata)});
111113
}
112114
}
113115

@@ -152,8 +154,8 @@ TensorpipeReadBuffers tensorpipeAllocate(tensorpipe::Message& tpMessage) {
152154

153155
for (auto& tensor : tpMessage.tensors) {
154156
buffers.tensors.emplace_back(
155-
at::getCPUAllocator()->allocate(tensor.length));
156-
tensor.data = buffers.tensors.back().get();
157+
at::getCPUAllocator()->allocate(tensor.buffer.cpu.length));
158+
tensor.buffer.cpu.ptr = buffers.tensors.back().get();
157159
}
158160

159161
return buffers;

0 commit comments

Comments
 (0)
Please sign in to comment.