Skip to content

Commit b22abbe

Browse files
rohan-varma authored and facebook-github-bot committed Sep 9, 2020
Enable test_distributed to work with spawn mode (pytorch#41769)
Summary: Pull Request resolved: pytorch#41769 Currently the tests in `test_distributed` only work with the `fork` mode multiprocessing, this PR introduces support for `spawn` mode multiprocessing as well (while keeping the `fork` mode intact). Motivations for the change: 1) Spawn multiprocessing is the default on MacOS, so it better emulates how MacOS users would use distributed 2) With python 3.8+, spawn is the default on linux, so we should have test coverage for this 3) PT multiprocessing suggests using spawn/forkserver over fork, for sharing cuda tensors: https://pytorch.org/docs/stable/multiprocessing.html 4) Spawn is better supported with respect to certain sanitizers such as TSAN, so adding this sanitizer coverage may help us uncover issues. How it is done: 1) Move `test_distributed` tests in `_DistTestBase` class to a shared file `distributed_test` (similar to how the RPC tests are structured) 2) For `Barrier`, refactor the setup of temp directories, as the current version did not work with spawn, each process would get a different randomly generated directory and thus would write to different barriers. 3) Add all the relevant builds to run internally and in OSS. Running test_distributed with spawn mode in OSS can be done with: `python test/run_test.py -i distributed/test_distributed_spawn -v` Reviewed By: izdeby Differential Revision: D22408023 fbshipit-source-id: e206be16961fd80438f995e221f18139d7e6d2a9
1 parent 1d01fcd commit b22abbe

File tree

5 files changed

+3223
-3170
lines changed

5 files changed

+3223
-3170
lines changed
 

‎test/distributed/test_distributed.py

+9-3,168
Large diffs are not rendered by default.
+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from __future__ import absolute_import, division, print_function, unicode_literals
2+
3+
import os
4+
import sys
5+
import unittest
6+
7+
import torch.distributed as dist
8+
from torch.testing._internal.common_utils import run_tests, TEST_WITH_ASAN, NO_MULTIPROCESSING_SPAWN
9+
from torch.testing._internal.distributed.distributed_test import (
10+
DistributedTest, TestDistBackend
11+
)
12+
13+
if not dist.is_available():
14+
print("Distributed not available, skipping tests", file=sys.stderr)
15+
sys.exit(0)
16+
17+
BACKEND = os.environ["BACKEND"]
18+
19+
if BACKEND == "gloo" or BACKEND == "nccl":
20+
21+
@unittest.skipIf(
22+
TEST_WITH_ASAN, "Skip ASAN as torch + multiprocessing spawn have known issues"
23+
)
24+
@unittest.skipIf(
25+
NO_MULTIPROCESSING_SPAWN, "Spawn not available, skipping tests."
26+
)
27+
class TestDistBackendWithSpawn(TestDistBackend, DistributedTest._DistTestBase):
28+
29+
def setUp(self):
30+
super().setUp()
31+
self._spawn_processes()
32+
33+
34+
if __name__ == "__main__":
35+
run_tests()

‎test/run_test.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
'test_dataloader',
3636
'distributed/test_data_parallel',
3737
'distributed/test_distributed',
38+
'distributed/test_distributed_spawn',
3839
'test_distributions',
3940
'test_expecttest',
4041
'test_foreach',
@@ -96,6 +97,7 @@
9697
'distributed/rpc/test_process_group_agent',
9798
'distributed/rpc/test_tensorpipe_agent',
9899
'distributed/test_distributed',
100+
'distributed/test_distributed_spawn',
99101
]
100102

101103
ROCM_BLOCKLIST = [
@@ -142,6 +144,7 @@
142144
'distributed/rpc/test_process_group_agent',
143145
'distributed/rpc/test_tensorpipe_agent',
144146
'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks',
147+
'distributed/test_distributed_spawn',
145148
'test_cuda',
146149
'test_cuda_primary_ctx',
147150
'test_cpp_extensions_aot_ninja',
@@ -306,7 +309,8 @@ def test_distributed(test_module, test_directory, options):
306309
for with_init_file in {True, False}:
307310
tmp_dir = tempfile.mkdtemp()
308311
if options.verbose:
309-
with_init = ' with file init_method' if with_init_file else ''
312+
init_str = "with {} init_method"
313+
with_init = init_str.format("file" if with_init_file else "env")
310314
print_to_stderr(
311315
'Running distributed tests for the {} backend{}'.format(
312316
backend, with_init))
@@ -315,7 +319,7 @@ def test_distributed(test_module, test_directory, options):
315319
os.environ['INIT_METHOD'] = 'env://'
316320
os.environ.update(env_vars)
317321
if with_init_file:
318-
if test_module == "test_distributed":
322+
if test_module in ["test_distributed", "test_distributed_spawn"]:
319323
init_method = 'file://{}/'.format(tmp_dir)
320324
else:
321325
init_method = 'file://{}/shared_init_file'.format(tmp_dir)
@@ -348,6 +352,7 @@ def test_distributed(test_module, test_directory, options):
348352
'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja,
349353
'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja,
350354
'distributed/test_distributed': test_distributed,
355+
'distributed/test_distributed_spawn': test_distributed,
351356
}
352357

353358

‎torch/testing/_internal/common_distributed.py

+21
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,26 @@ def compute_sum(fn, world_size):
194194
]
195195
]
196196

# Handle to the shared TemporaryDirectory used by the barrier/init-file
# machinery; None when no directory is currently allocated.
tmp_dir = None


def initialize_temp_directories(init_method=None):
    """Create the shared temp directory tree used by spawned test workers.

    Creates ``barrier``, ``test_dir`` and ``init_dir`` subdirectories and
    exports ``TEMP_DIR`` so every worker (including spawn-mode children,
    which do not inherit module globals) agrees on the same paths.

    Args:
        init_method: optional init-method URL to export as ``INIT_METHOD``.
            When None, a ``file://`` URL pointing at a shared init file
            inside ``init_dir`` is exported instead.
    """
    global tmp_dir
    tmp_dir = tempfile.TemporaryDirectory()
    os.environ["TEMP_DIR"] = tmp_dir.name
    os.mkdir(os.path.join(tmp_dir.name, "barrier"))
    os.mkdir(os.path.join(tmp_dir.name, "test_dir"))
    init_dir_path = os.path.join(tmp_dir.name, "init_dir")
    os.mkdir(init_dir_path)
    # Set init method if specified.
    if init_method is not None:
        os.environ["INIT_METHOD"] = init_method
    else:
        os.environ["INIT_METHOD"] = "file://" + os.path.join(
            init_dir_path, "shared_init_file"
        )


def cleanup_temp_dir():
    """Remove the shared temp directory created by initialize_temp_directories.

    Safe to call when no directory was created; resets ``tmp_dir`` to None
    afterwards so repeated calls (or a later re-initialization) never invoke
    ``cleanup()`` on an already-removed directory.
    """
    global tmp_dir
    if tmp_dir is not None:
        tmp_dir.cleanup()
        # Drop the stale handle so this function is idempotent.
        tmp_dir = None
197217

198218
# [How does MultiProcessTestCase work?]
199219
# Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
@@ -243,6 +263,7 @@ def __init__(self, method_name='runTest'):
243263
def setUp(self):
244264
super().setUp()
245265
self.skip_return_code_checks = []
266+
self.processes = []
246267
self.rank = self.MAIN_PROCESS_RANK
247268
self.file_name = tempfile.NamedTemporaryFile(delete=False).name
248269

‎torch/testing/_internal/distributed/distributed_test.py

+3,151
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.