Commit a2b4177

pritamdamania authored and facebook-github-bot committed on Sep 25, 2020
Add barrier() at the end of init_process_group and new_group. (pytorch#45181)
Summary: Pull Request resolved: pytorch#45181

`init_process_group` and `new_group` update a number of global variables after initializing the actual process group. As a result, there is a race: after initializing the process group on, say, rank 0, if we immediately check the default process group on rank 1 (say via RPC), we might get an error because rank 1 has not yet updated its _default_pg variable.

To resolve this issue, I've added a barrier() at the end of both of these calls. This ensures that once these calls return, we are guaranteed the initialization is correct on all ranks. Since these calls are mostly made during initialization, the overhead of a barrier() here should be acceptable.

Closes: pytorch#40434, pytorch#40378

ghstack-source-id: 112923112

Test Plan: Reproduced the failures in pytorch#40434 and pytorch#40378 and verified that this PR fixes them.

Reviewed By: mrshenli

Differential Revision: D23858025

fbshipit-source-id: c4d5e46c2157981caf3ba1525dec5310dcbc1830
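For readers who want to see the failure mode concretely, here is a minimal, illustrative sketch (not taken from the PR or the linked issues) of the pattern that used to race: one rank returns from init_process_group and immediately asks a peer, over RPC, whether its default process group exists. The worker names, ports, and two-rank layout are invented for the example.

import os

import torch.distributed as dist
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp


def worker(rank: int, world_size: int) -> None:
    # Rendezvous for the RPC agent (hypothetical local address/port).
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)

    # Separate rendezvous for the process group so it does not clash with
    # the RPC agent's MASTER_ADDR/MASTER_PORT rendezvous.
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29501",
        rank=rank,
        world_size=world_size,
    )

    if rank == 0:
        # Before this change, rank 1's RPC server threads could run this check
        # while rank 1's main thread was still updating its process-group
        # globals; the trailing barrier() closes that window.
        assert rpc.rpc_sync("worker1", dist.is_initialized)

    rpc.shutdown()
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)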
1 parent: 3b7e4f8 · commit: a2b4177

File tree

5 files changed: +32 −9 lines changed


test/cpp_extensions/cpp_c10d_extension.cpp (+1 −1)

@@ -63,7 +63,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupTest::allgather_base(

 std::shared_ptr<ProcessGroup::Work> ProcessGroupTest::barrier(
     const BarrierOptions& opts) {
-  throw std::runtime_error("ProcessGroupTest does not support barrier");
+  return std::make_shared<ProcessGroupTest::WorkTest>();
 }

 std::shared_ptr<ProcessGroup::Work> ProcessGroupTest::gather(

test/distributed/test_c10d.py (+6 −4)

@@ -334,11 +334,11 @@ def test_unknown_handler(self):
 @skip_if_win32()
 class RendezvousEnvTest(TestCase):
     @retry_on_connect_failures
+    @requires_nccl()
     def test_common_errors(self):
-        # TODO remove this hack
-        if not hasattr(c10d, "ProcessGroupNCCL"):
-            raise unittest.SkipTest("C10D is not built with NCCL process group,"
-                                    " skipping test")
+        if torch.cuda.device_count() == 0:
+            raise unittest.SkipTest("No GPUs available, skipping test")
+
         vars = {
             "WORLD_SIZE": "1",
             "RANK": "0",
@@ -579,6 +579,8 @@ def _test_default_store_timeout(self, backend):
     @requires_nccl()
     @retry_on_connect_failures
     def test_default_store_timeout_nccl(self):
+        if torch.cuda.device_count() == 0:
+            raise unittest.SkipTest("No GPUs available, skipping test")
         self._test_default_store_timeout('nccl')

     @requires_gloo()

torch/distributed/distributed_c10d.py (+9 −0)

@@ -436,6 +436,10 @@ def init_process_group(backend,
     _backend = _pg_map[_default_pg][0]
     _default_pg_init_method = init_method

+    # barrier at the end to ensure that once we return from this method, all
+    # process groups including global variables are updated correctly on all
+    # ranks.
+    barrier()

 def _new_process_group_helper(world_size,
                               rank,
@@ -2025,4 +2029,9 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None):
         for group_rank, global_rank in enumerate(ranks)
     }

+    # barrier at the end to ensure that once we return from this method, all
+    # process groups including global variables are updated correctly on all
+    # ranks.
+    barrier()
+
     return pg
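Worth noting for callers: new_group is documented to require that every process in the default group call it (and create groups in the same order when several are created), even processes that will not be members of the new group, and the trailing barrier() means ranks that do call it now also block until all ranks have. This is also why the DDP-under-dist-autograd test below adds dist.new_group(TRAINER_RANKS) to the master and remote-worker processes. A hedged, stand-alone sketch of the expected calling pattern, with an invented rank split and rendezvous address:

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank: int, world_size: int) -> None:
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29502",  # arbitrary local rendezvous
        rank=rank,
        world_size=world_size,
    )

    # Every rank makes the same new_group() call, including rank 0, which is
    # not a member of the subgroup; non-members simply never use the handle.
    trainer_ranks = list(range(1, world_size))
    trainer_group = dist.new_group(trainer_ranks)

    if rank in trainer_ranks:
        t = torch.ones(1)
        dist.all_reduce(t, group=trainer_group)
        assert t.item() == len(trainer_ranks)

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(3,), nprocs=3)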

torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py (+12 −2)

@@ -325,14 +325,19 @@ def trainer_name(self, rank):
         # The name has to be consistent with that in 'dist_init' decorator.
         return f"worker{rank}"

-    def _remote_worker_process(self):
+    def _remote_worker_process(self, ddp_mode):
         gLogger.info("The remote worker is running.")
         dist.init_process_group(
             backend="gloo",
             init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
             world_size=self.world_size,
             rank=self.rank,
         )
+
+        if ddp_mode in (DdpMode.INSIDE, DdpMode.OUTSIDE):
+            # new_group needs to be called on ranks.
+            dist.new_group(TRAINER_RANKS)
+
         global shutdown_signal
         with shutdown_signal:
             shutdown_signal.wait()
@@ -367,6 +372,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool):
             world_size=self.world_size,
             rank=self.rank,
         )
+
         remote_em_rref = rpc.remote(
             self.remote_worker_name(), RemoteEM, args=(NUM_EM_ROW, D_SPARSE)
         )
@@ -401,6 +407,10 @@ def do_test_on_master(
             )
         )

+        if ddp_mode in (DdpMode.INSIDE, DdpMode.OUTSIDE):
+            # new_group needs to be called on ranks.
+            dist.new_group(TRAINER_RANKS)
+
         training_examples = get_training_examples()
         for _ in range(3):
             futures = []
@@ -455,7 +465,7 @@ def _do_test(self, ddp_mode, simulate_uneven_inputs=False):
         if self.rank == MASTER_RANK:
             self._master_process(ddp_mode, simulate_uneven_inputs)
         elif self.rank == REMOTE_WORKER_RANK:
-            self._remote_worker_process()
+            self._remote_worker_process(ddp_mode)
         elif self.rank in TRAINER_RANKS:
             self._trainer_process(self.rank)
         else:

torch/testing/_internal/distributed/distributed_test.py (+4 −2)

@@ -285,6 +285,8 @@ def init_method(self):

     @classmethod
     def _run(cls, rank, test_name, file_name):
+        if BACKEND == 'nccl' and not torch.cuda.is_available():
+            sys.exit(TEST_SKIPS['no_cuda'].exit_code)
         self = cls(test_name)
         self.rank = rank
         self.file_name = file_name
@@ -2283,7 +2285,7 @@ def test_DistributedDataParallel_requires_grad(self):
     @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
     @skip_if_rocm
     def test_DistributedDataParallel_non_default_stream(self):
-        stream = torch.cuda.Stream()
+        stream = torch.cuda.Stream(self.rank)
         rank = self.rank
         with torch.cuda.stream(stream):
             net = torch.nn.parallel.DistributedDataParallel(
@@ -3020,7 +3022,7 @@ def _run_uneven_inputs_test(
         rank = self.rank
         sync_interval = test_case.sync_interval
         # Ensure all outsanding GPU work is comlete so this test runs independently.
-        torch.cuda.synchronize()
+        dist.barrier()
         # Bucket_cap_mb is intentionally low to test allreduce scheduling when
         # there are many buckets.
         net = torch.nn.parallel.DistributedDataParallel(
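A side note on the last hunk: the swap is from a device-level wait to a process-level one, and the two are not interchangeable. torch.cuda.synchronize() only waits for the calling process's own pending GPU work, while dist.barrier() blocks until every participating rank reaches the same point. A small sketch of the distinction (the helper name is invented; it assumes an already-initialized default process group):

import torch
import torch.distributed as dist


def quiesce_before_test(rank: int) -> None:
    # Local only: wait for this process's own outstanding CUDA work.
    if torch.cuda.is_available():
        torch.cuda.synchronize(rank)

    # Cross-process: return only once every rank has reached this point.
    dist.barrier()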
