
Commit 574f9af

mingzhe09088 authored and facebook-github-bot committed on Sep 16, 2020
[NCCL] Add option to run NCCL on high priority cuda stream (pytorch#43796)
Summary: Pull Request resolved: pytorch#43796

This diff adds an option for the process group NCCL backend to pick high priority cuda streams.

Test Plan: waitforsandcastle
Reviewed By: jiayisuse
Differential Revision: D23404286
fbshipit-source-id: b79ae097b7cd945a26e8ba1dd13ad3147ac790eb
1 parent 161490d commit 574f9af
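
What this enables from Python, as a minimal sketch that is not part of the commit: the new ProcessGroupNCCL.Options binding added in init.cpp below exposes is_high_priority and op_timeout. The snippet assumes an NCCL-enabled build with a visible GPU, and uses a single-process FileStore (path and world size are illustrative only) in place of the env:// rendezvous used by the new test in distributed_test.py.

import torch.distributed as dist

# Hypothetical single-process setup; a FileStore stands in for the
# env:// rendezvous used by the new test in distributed_test.py.
store = dist.FileStore("/tmp/nccl_pg_example_store", 1)

opts = dist.ProcessGroupNCCL.Options()
opts.is_high_priority = True  # schedule NCCL collectives on high priority CUDA streams
pg = dist.ProcessGroupNCCL(store, 0, 1, opts)  # rank 0, world size 1

op_timeout can be set on the same Options object, taking the place of the old timeout-only constructor argument.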

5 files changed: +92 -26

 

‎torch/csrc/distributed/c10d/init.cpp

+21 -6

@@ -685,19 +685,34 @@ They are used in specifying strategies for reduction collectives, e.g.,
 #endif
 
 #ifdef USE_C10D_NCCL
-  shared_ptr_class_<::c10d::ProcessGroupNCCL>(
+  auto processGroupNCCL = shared_ptr_class_<::c10d::ProcessGroupNCCL>(
       module, "ProcessGroupNCCL", processGroup)
+      .def(py::init<
+               const std::shared_ptr<::c10d::Store>&,
+               int,
+               int,
+               ::c10d::ProcessGroupNCCL::Options>())
       .def(
-          py::init<
-              const std::shared_ptr<::c10d::Store>&,
-              int,
-              int,
-              const std::chrono::milliseconds&>(),
+          py::init([](const std::shared_ptr<::c10d::Store>& store,
+                      int rank,
+                      int size,
+                      const std::chrono::milliseconds& timeout){
+            ::c10d::ProcessGroupNCCL::Options options;
+            options.isHighPriorityStream = false;
+            options.opTimeout = timeout;
+            return std::make_shared<::c10d::ProcessGroupNCCL>(
+                store, rank, size, options);
+          }),
          py::arg("store"),
          py::arg("rank"),
          py::arg("size"),
          py::arg("timeout") = std::chrono::milliseconds(
              ::c10d::ProcessGroupNCCL::kProcessGroupNCCLOpTimeoutMillis));
+
+  py::class_<::c10d::ProcessGroupNCCL::Options>(processGroupNCCL, "Options")
+      .def(py::init<>())
+      .def_readwrite("is_high_priority", &::c10d::ProcessGroupNCCL::Options::isHighPriorityStream)
+      .def_readwrite("op_timeout", &::c10d::ProcessGroupNCCL::Options::opTimeout);
 #endif
 
 #ifdef USE_C10D_MPI
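
Note that the lambda-based py::init above keeps the pre-existing timeout-only Python constructor working (it simply builds an Options with is_high_priority left at False), so existing callers are unaffected. A hedged sketch of the two construction paths, reusing the store from the example near the top and assuming pybind11's chrono conversion for the timeout argument (which the existing timeout binding already relies on):

from datetime import timedelta
import torch.distributed as dist

# Legacy path: routed through the py::init lambda, is_high_priority stays False.
pg_default = dist.ProcessGroupNCCL(store, 0, 1, timeout=timedelta(seconds=60))

# New path: explicit Options object, giving access to both knobs.
opts = dist.ProcessGroupNCCL.Options()
opts.is_high_priority = True
pg_high_priority = dist.ProcessGroupNCCL(store, 0, 1, opts)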

‎torch/lib/c10d/ProcessGroupNCCL.cpp

+8 -5

@@ -430,13 +430,14 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     const std::shared_ptr<Store>& store,
     int rank,
     int size,
-    const std::chrono::milliseconds& opTimeout)
+    Options options)
     : ProcessGroup(rank, size),
       store_(store),
       ncclCommCounter_(0),
       terminateProcessGroup_(false),
-      opTimeout_(opTimeout),
-      futureNCCLCallbackStreams_(c10::cuda::device_count()) {
+      opTimeout_(options.opTimeout),
+      futureNCCLCallbackStreams_(c10::cuda::device_count()),
+      isHighPriorityStream_(options.isHighPriorityStream) {
   try {
     parseNcclBlockingWait();
   } catch (std::exception& e) {

@@ -769,14 +770,14 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
     ncclComms[i] = NCCLComm::create(numRanks, rank, ncclID);
 
     // Creates the NCCL streams
-    streamVal.push_back(at::cuda::getStreamFromPool());
+    streamVal.push_back(at::cuda::getStreamFromPool(isHighPriorityStream_));
 
     // If not set before, get a dedicated stream for the device to run
     // FutureNCCL then callbacks.
     std::lock_guard<std::mutex> lock(mutex_);
     if (futureNCCLCallbackStreams_[deviceIndex] == nullptr) {
       futureNCCLCallbackStreams_[deviceIndex] =
-          std::make_shared<at::cuda::CUDAStream>(at::cuda::getStreamFromPool());
+          std::make_shared<at::cuda::CUDAStream>(at::cuda::getStreamFromPool(isHighPriorityStream_));
     }
   }
 
@@ -931,6 +932,8 @@ void ProcessGroupNCCL::workEnqueue(
     workList_.emplace_back(std::move(work));
   }
 }
+ProcessGroupNCCL::Options::Options()
+    : opTimeout(kProcessGroupNCCLOpTimeoutMillis), isHighPriorityStream(false) {}
 
 template <typename Fn, typename PreProcess, typename PostProcess>
 std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::collective(

‎torch/lib/c10d/ProcessGroupNCCL.hpp

+13 -5

@@ -161,6 +161,13 @@ class ProcessGroupNCCL : public ProcessGroup {
     friend class ProcessGroupNCCL;
   };
 
+  struct Options {
+    explicit Options();
+
+    std::chrono::milliseconds opTimeout;
+    bool isHighPriorityStream;
+  };
+
   // FutureNCCL is a subclass of ivalue's Future. The goal is to use
   // this class in getFuture API of WorkNCCL. This Future is mostly a
   // wrapper to synchronize streams appropriately and it mostly enables

@@ -341,8 +348,7 @@ class ProcessGroupNCCL : public ProcessGroup {
       const std::shared_ptr<Store>& store,
       int rank,
       int size,
-      const std::chrono::milliseconds& opTimeout =
-          std::chrono::milliseconds(kProcessGroupNCCLOpTimeoutMillis));
+      Options options = Options());
 
   // This constructor includes the deprecated `groupName` argument.
   // If you have existing code that uses the `groupName`, you can replace

@@ -352,9 +358,8 @@ class ProcessGroupNCCL : public ProcessGroup {
       int rank,
       int size,
       const std::string& groupName,
-      const std::chrono::milliseconds& opTimeout =
-          std::chrono::milliseconds(kProcessGroupNCCLOpTimeoutMillis))
-      : ProcessGroupNCCL(store, rank, size, opTimeout) {}
+      Options options = Options())
+      : ProcessGroupNCCL(store, rank, size, options) {}
 
   virtual ~ProcessGroupNCCL();
 
@@ -626,6 +631,9 @@ class ProcessGroupNCCL : public ProcessGroup {
   // of the corresponding device inside ProcessGroupNCCL::getNCCLComm if not set
   // before.
   std::vector<std::shared_ptr<at::cuda::CUDAStream>> futureNCCLCallbackStreams_;
+
+  // Schedule NCCL operations on high priority CUDA streams.
+  bool isHighPriorityStream_ = false;
 };
 
 } // namespace c10d
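
Per the Options::Options() definition in ProcessGroupNCCL.cpp above, a default-constructed Options keeps today's behavior: opTimeout starts at kProcessGroupNCCLOpTimeoutMillis and isHighPriorityStream at false. A small Python-side sanity sketch, not part of the commit (assumes an NCCL-enabled build; reading op_timeout back as a timedelta relies on pybind11's chrono caster):

import torch.distributed as dist

opts = dist.ProcessGroupNCCL.Options()
assert opts.is_high_priority is False  # default set in Options::Options()
print(opts.op_timeout)                 # kProcessGroupNCCLOpTimeoutMillis, exposed as a timedelta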

‎torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp

+13 -7

@@ -38,8 +38,8 @@ class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
       const std::shared_ptr<c10d::Store>& store,
       int rank,
       int size,
-      std::chrono::milliseconds timeout)
-      : ProcessGroupNCCL(store, rank, size, timeout), simulate_error_(false) {}
+      c10d::ProcessGroupNCCL::Options opts)
+      : ProcessGroupNCCL(store, rank, size, opts), simulate_error_(false) {}
 
   std::exception_ptr checkForNCCLErrors(
       const std::vector<std::shared_ptr<c10d::NCCLComm>>& ncclComms) override {

@@ -100,8 +100,8 @@ class ProcessGroupNCCLTimedOutErrors : public ProcessGroupNCCLSimulateErrors {
       const std::shared_ptr<c10d::Store>& store,
       int rank,
       int size,
-      std::chrono::milliseconds timeout)
-      : ProcessGroupNCCLSimulateErrors(store, rank, size, timeout),
+      c10d::ProcessGroupNCCL::Options opts)
+      : ProcessGroupNCCLSimulateErrors(store, rank, size, opts),
         set_timedout_error_(false) {}
 
   std::shared_ptr<ProcessGroupNCCL::WorkNCCL> initWork(

@@ -165,8 +165,10 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
   }
 
   ASSERT_TRUE(setenv(c10d::NCCL_BLOCKING_WAIT, "1", 1) == 0);
+  c10d::ProcessGroupNCCL::Options options;
+  options.opTimeout = std::chrono::milliseconds(1000);
   ProcessGroupNCCLSimulateErrors pg(
-      store_, 0, 1, std::chrono::milliseconds(1000));
+      store_, 0, 1, options);
 
   auto work = pg.allreduce(tensors_);
   work->wait();

@@ -192,8 +194,10 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLTimedoutErrorsBlocking) {
   }
 
   ASSERT_TRUE(setenv(c10d::NCCL_BLOCKING_WAIT, "1", 1) == 0);
+  c10d::ProcessGroupNCCL::Options options;
+  options.opTimeout = std::chrono::milliseconds(3000);
   ProcessGroupNCCLTimedOutErrors pg(
-      store_, 0, 1, std::chrono::milliseconds(3000));
+      store_, 0, 1, options);
 
   auto work = pg.allreduce(tensors_);
   work->wait();

@@ -213,8 +217,10 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNonBlocking) {
     return;
   }
 
+  c10d::ProcessGroupNCCL::Options options;
+  options.opTimeout = std::chrono::milliseconds(3000);
   ProcessGroupNCCLSimulateErrors pg(
-      store_, 0, 1, std::chrono::milliseconds(3000));
+      store_, 0, 1, options);
 
   auto work = pg.allreduce(tensors_);
   pg.barrier()->wait();

‎torch/testing/_internal/distributed/distributed_test.py

+37 -3

@@ -689,7 +689,7 @@ def test_irecv(self):
 
     # BROADCAST
     def _test_broadcast_helper(
-        self, group, group_id, rank, cuda=False, rank_to_GPU=None
+        self, group, group_id, rank, cuda=False, rank_to_GPU=None, with_options=False
     ):
         for dtype, value, requires_cuda in [
             (torch.float, -1e-10, False),

@@ -707,12 +707,24 @@ def _test_broadcast_helper(
                 if cuda:
                     expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0])
                 if rank == src:
-                    dist.broadcast(expected_tensor, src, group_id)
+                    if with_options:
+                        opts = dist.BroadcastOptions()
+                        opts.rootTensor = 0
+                        opts.rootRank = src
+                        group_id.broadcast([expected_tensor], opts).wait()
+                    else:
+                        dist.broadcast(expected_tensor, src, group_id)
                 else:
                     tensor = _build_tensor(src + 1, -1, dtype)
                     if cuda:
                         tensor = tensor.cuda(rank_to_GPU[rank][0])
-                    dist.broadcast(tensor, src, group_id)
+                    if with_options:
+                        opts = dist.BroadcastOptions()
+                        opts.rootTensor = 0
+                        opts.rootRank = src
+                        group_id.broadcast([tensor], opts).wait()
+                    else:
+                        dist.broadcast(tensor, src, group_id)
                     self.assertEqual(tensor.size(), expected_tensor.size())
                     self.assertEqual(tensor.ne(expected_tensor).max(), torch.tensor(False))
 
@@ -744,6 +756,28 @@ def test_broadcast_full_group(self):
         group, group_id, rank = self._init_full_group_test()
         self._test_broadcast_helper(group, group_id, rank)
 
+    @unittest.skipIf(
+        BACKEND != "nccl",
+        "Only NCCL backend supports high priority stream",
+    )
+    @skip_if_no_gpu
+    @skip_if_rocm
+    def test_nccl_high_priority_stream(self):
+        group, _, rank = self._init_global_test()
+        rank_to_GPU = self._init_multigpu_helper()
+
+        new_port = str(MASTER_PORT + 1)
+        os.environ['MASTER_PORT'] = new_port
+        gen_iterator = dist.rendezvous('env://', rank, dist.get_world_size())
+        store, rank, size = next(gen_iterator)
+        store = dist.PrefixStore(new_port, store)
+
+        opts = dist.ProcessGroupNCCL.Options()
+        opts.is_high_priority = False
+        group_id = dist.ProcessGroupNCCL(store, rank, size, opts)
+
+        self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU, True)
+
     # REDUCE
     def _test_reduce_helper(
         self,
