
Commit 5b702ab

Krovatkin authored and facebook-github-bot committed on Nov 11, 2019
switching to a simple/full executor
Summary: Pull Request resolved: pytorch#29230

Differential Revision: D18402229

Pulled By: Krovatkin

fbshipit-source-id: 62f4bc9bc89c0c7369359bba1359c22a2fa80f46
1 parent cedca37 commit 5b702ab

19 files changed: +415 −265 lines
 

‎.circleci/config.yml

+16
@@ -1895,6 +1895,22 @@ workflows:
           ios_platform: "OS"
           requires:
             - setup
+      - pytorch_linux_test:
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test
+          requires:
+            - setup
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:347"
+          resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test
+          requires:
+            - setup
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:347"
+          resource_class: large
       - caffe2_linux_build:
           name: caffe2_onnx_py2_gcc5_ubuntu16_04_build
           requires:

‎.circleci/generate_config_yml.py

+1
@@ -93,6 +93,7 @@ def write(self, output_filehandle):
            File("workflows-pytorch-macos-builds.yml"),
            File("workflows-pytorch-android-gradle-build.yml"),
            File("workflows-pytorch-ios-builds.yml"),
+            File("workflows-pytorch-ge-config-tests.yml"),
            Listgen(caffe2_build_definitions.get_workflow_jobs, 3),
            File("workflows-binary-builds-smoke-subset.yml"),
            Listgen(binary_build_definitions.get_binary_smoke_test_jobs, 3),

‎.circleci/scripts/should_run_job.py

+4
@@ -65,6 +65,10 @@
     # XLA
     'pytorch-xla-linux-xenial-py3.6-clang7',
 
+    # GraphExecutor config jobs
+    'pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test',
+    'pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test',
+
     # Other checks
     'pytorch-short-perf-test-gpu',
     'pytorch-python-doc-push',
‎.circleci/verbatim-sources/workflows-pytorch-ge-config-tests.yml

+16
@@ -0,0 +1,16 @@
+      - pytorch_linux_test:
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test
+          requires:
+            - setup
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:347"
+          resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test
+          requires:
+            - setup
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:347"
+          resource_class: large

‎.jenkins/pytorch/test.sh

+15-1
@@ -109,8 +109,18 @@ test_python_nn() {
   assert_git_not_dirty
 }
 
+test_python_ge_config_simple() {
+  time python test/run_test.py --include jit_simple --verbose
+  assert_git_not_dirty
+}
+
+test_python_ge_config_legacy() {
+  time python test/run_test.py --include jit_legacy jit_fuser_legacy --verbose
+  assert_git_not_dirty
+}
+
 test_python_all_except_nn() {
-  time python test/run_test.py --exclude nn --verbose --bring-to-front quantization quantized quantized_tensor quantized_nn_mods
+  time python test/run_test.py --exclude nn jit_simple jit_legacy jit_fuser_legacy --verbose --bring-to-front quantization quantized quantized_tensor quantized_nn_mods
   assert_git_not_dirty
 }
 
@@ -219,6 +229,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then
 elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then
   test_torchvision
   test_xla
+elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then
+  test_python_ge_config_legacy
+elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_simple* || "${JOB_BASE_NAME}" == *ge_config_simple* ]]; then
+  test_python_ge_config_simple
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
   # TODO: run some C++ tests
   echo "no-op at the moment"

‎test/common_utils.py

+50-2
@@ -39,17 +39,65 @@
 from torch._six import string_classes, inf
 import torch.backends.cudnn
 import torch.backends.mkl
-
+from enum import Enum
 
 torch.backends.disable_global_flags()
 
+IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle'
+
+class ProfilingMode(Enum):
+    LEGACY = 1
+    SIMPLE = 2
+    PROFILING = 3
+
+@contextmanager
+def enable_profiling_mode():
+    if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
+        old_prof_exec_state = torch._C._jit_set_profiling_executor(True)
+        old_prof_mode_state = torch._C._jit_set_profiling_mode(True)
+    try:
+        yield
+    finally:
+        if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
+            torch._C._jit_set_profiling_executor(old_prof_exec_state)
+            torch._C._jit_set_profiling_mode(old_prof_mode_state)
+
+func_call = torch._C.ScriptFunction.__call__
+meth_call = torch._C.ScriptMethod.__call__
+
+def prof_callable(callable, *args, **kwargs):
+    if 'profile_and_replay' in kwargs:
+        del kwargs['profile_and_replay']
+        if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
+            with enable_profiling_mode():
+                callable(*args, **kwargs)
+                return callable(*args, **kwargs)
+
+    return callable(*args, **kwargs)
+
+def prof_func_call(*args, **kwargs):
+    return prof_callable(func_call, *args, **kwargs)
+
+def prof_meth_call(*args, **kwargs):
+    return prof_callable(meth_call, *args, **kwargs)
+
+torch._C.ScriptFunction.__call__ = prof_func_call
+torch._C.ScriptMethod.__call__ = prof_meth_call
 
 parser = argparse.ArgumentParser(add_help=False)
 parser.add_argument('--subprocess', action='store_true',
                     help='whether to run each test in a subprocess')
 parser.add_argument('--seed', type=int, default=1234)
 parser.add_argument('--accept', action='store_true')
+parser.add_argument('--ge_config', type=str)
+
+GRAPH_EXECUTOR = ProfilingMode.SIMPLE if IS_SANDCASTLE else ProfilingMode.PROFILING
 args, remaining = parser.parse_known_args()
+if args.ge_config == 'legacy':
+    GRAPH_EXECUTOR = ProfilingMode.LEGACY
+elif args.ge_config == 'simple':
+    GRAPH_EXECUTOR = ProfilingMode.SIMPLE
+
 TEST_IN_SUBPROCESS = args.subprocess
 SEED = args.seed
 if not expecttest.ACCEPT:
@@ -1229,7 +1277,7 @@ def get_int64_dtype(dtype):
         int64_dtype, layout, device, fv + 5, False)
 
 
-IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle'
+
 
 THESE_TAKE_WAY_TOO_LONG = {
     'test_Conv3d_groups',

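Note: the helpers added to common_utils.py above replace the profiling machinery that jit_utils.py used to carry. GRAPH_EXECUTOR selects the executor configuration (LEGACY, SIMPLE, or PROFILING) from the new --ge_config flag, enable_profiling_mode() flips the profiling-executor bindings only in the PROFILING configuration, and the patched ScriptFunction/ScriptMethod __call__ wrappers accept a profile_and_replay keyword that runs the callable one extra time under profiling before the real call. A minimal usage sketch follows (illustrative only; my_fn is a made-up function, not part of this commit):

import torch
from common_utils import GRAPH_EXECUTOR, ProfilingMode, enable_profiling_mode

def my_fn(x):
    return x * 2 + 1

if GRAPH_EXECUTOR != ProfilingMode.SIMPLE:  # gradient/profiling-specific tests skip the SIMPLE config
    scripted = torch.jit.script(my_fn)
    with enable_profiling_mode():
        # the patched __call__ strips profile_and_replay and, under PROFILING,
        # performs a warm-up call so the profiled graph is ready for the second run
        out = scripted(torch.randn(3), profile_and_replay=True)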
‎test/cpp/jit/test_misc.cpp

+4-1
@@ -1001,7 +1001,10 @@ graph(%a):
     return stack;
   };
   run(graph, stack);
-  AT_ASSERT(testPassValue);
+  // we will not run fusion in simple mode
+  if (!getExecutorMode()) {
+    AT_ASSERT(testPassValue);
+  }
 }
 
 static void checkShape(

‎test/jit/test_autodiff_subgraph_slicing.py

+20-10
@@ -1,6 +1,7 @@
 import os
 import sys
-
+import unittest
+from common_utils import GRAPH_EXECUTOR, ProfilingMode, enable_profiling_mode
 import torch
 
 # Make the helper files in test/ importable
@@ -21,18 +22,21 @@
 def pyfn(a, b):
     return a * b
 
+@unittest.skipIf(GRAPH_EXECUTOR == ProfilingMode.SIMPLE, "Simple Executor doesn't support gradients")
 class TestAutodiffSubgraphSlicing(JitTestCase):
     # TODO: It is better if we can test directly on graphs instead of the current
     # end-to-end fashion.
     def _perform_ad_subgraph_slicing(self, fn, *input_sizes):
         with disable_autodiff_subgraph_inlining():
-            ge = torch.jit.script(fn)
-            inputs = [torch.randn(size, requires_grad=True) for size in input_sizes]
-            ge(*inputs)
-            return ge.graph_for(*inputs)
+            with enable_profiling_mode():
+                ge = torch.jit.script(fn)
+                inputs = [torch.randn(size, requires_grad=True) for size in input_sizes]
+                ge(*inputs, profile_and_replay=True)
+                return ge.graph_for(*inputs)
 
     def assertGraphSize(self, graph, size):
-        self.assertEqual(len(list(graph.nodes())), size)
+        nodes = list(filter(lambda n : n.kind() != "prim::BailOut" and n.kind() != "prim::BailoutTemplate", graph.nodes()))
+        self.assertEqual(len(list(nodes)), size)
 
     def test_chunk_constant_script_ad(self):
         @torch.jit.script
@@ -42,8 +46,9 @@ def func(x):
 
         input = torch.rand(6, 10).requires_grad_()
         with disable_autodiff_subgraph_inlining():
-            output = func(input)
-            self.assertAutodiffNode(func.graph_for(input), True, ['prim::ConstantChunk'], [])
+            with enable_profiling_mode():
+                output = func(input, profile_and_replay=True)
+                self.assertAutodiffNode(func.graph_for(input), True, ['prim::ConstantChunk'], [])
 
     def test_simple_merge(self):
         # o --> o
@@ -156,8 +161,13 @@ def fn(v, w, x, y):
 
         graph = self._perform_ad_subgraph_slicing(fn, 1, 1, 1, 1)
 
-        self.assertGraphSize(graph, 3)
-        self.assertGraphContainsExactly(graph, 'prim::DifferentiableGraph', 1)
+        # GuardElimination can't get rid of a prim::BailOut on ^pyfn
+        # which makes us create two `prim::DifferentiableGraph`s
+        # instead of just one
+        num_nodes = 4 if GRAPH_EXECUTOR == ProfilingMode.PROFILING else 3
+        self.assertGraphSize(graph, num_nodes)
+        num_diff_nodes = 2 if GRAPH_EXECUTOR == ProfilingMode.PROFILING else 1
+        self.assertGraphContainsExactly(graph, 'prim::DifferentiableGraph', num_diff_nodes)
 
     def test_respects_lexical_scoping(self):
         def fn(x, k):

‎test/jit/test_models.py

+11-8
@@ -1,7 +1,7 @@
 import os
 import sys
 import unittest
-
+from common_utils import enable_profiling_mode
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -228,8 +228,9 @@ def test_neural_style_cuda(self):
     @staticmethod
     def _test_mnist(self, device, check_export_import=True):
         # eval() is present because dropout makes this nondeterministic
-        self.checkTrace(MnistNet().to(device).eval(), (torch.rand(5, 1, 28, 28, device=device),),
-                        export_import=check_export_import)
+        with enable_profiling_mode():
+            self.checkTrace(MnistNet().to(device).eval(), (torch.rand(5, 1, 28, 28, device=device),),
+                            export_import=check_export_import)
 
     def test_mnist(self):
         self._test_mnist(self, device='cpu')
@@ -277,8 +278,9 @@ def forward(self, x):
             action_scores = self.affine2(x)
             return F.softmax(action_scores, dim=1)
 
-        self.checkTrace(Policy().to(device), (torch.rand(1, 4, device=device),),
-                        export_import=test_export_import)
+        with enable_profiling_mode():
+            self.checkTrace(Policy().to(device), (torch.rand(1, 4, device=device),),
+                            export_import=test_export_import)
 
     def test_reinforcement_learning(self):
         self._test_reinforcement_learning(self, device='cpu')
@@ -526,9 +528,10 @@ def forward(self, x):
                             export_import=False, allow_unused=True,
                             inputs_require_grads=False)
         else:
-            # eval() is present because randn_like makes this nondeterministic
-            self.checkTrace(VAE().to(device).eval(), (torch.rand(128, 1, 28, 28, device=device),),
-                            export_import=check_export_import)
+            with enable_profiling_mode():
+                # eval() is present because randn_like makes this nondeterministic
+                self.checkTrace(VAE().to(device).eval(), (torch.rand(128, 1, 28, 28, device=device),),
+                                export_import=check_export_import)
 
     def test_vae(self):
         self._test_vae(self, device='cpu')

‎test/jit_utils.py

+7-26
@@ -12,11 +12,10 @@
 import torch.jit.quantized
 import zipfile
 import functools
-from enum import Enum
 
 # Testing utils
 from common_utils import TestCase, IS_WINDOWS, \
-    freeze_rng_state, TemporaryFileName
+    freeze_rng_state, TemporaryFileName, enable_profiling_mode, ProfilingMode
 
 # Standard library
 from contextlib import contextmanager
@@ -33,28 +32,9 @@
 import tempfile
 import textwrap
 
-IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR = False
-
 RUN_CUDA = torch.cuda.is_available()
 RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1
 
-class ProfilingMode(Enum):
-    OFF = 1
-    EXECUTOR = 2
-    FULL = 3
-
-@contextmanager
-def enable_profiling_mode(flag):
-    if IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR:
-        old_prof_exec_state = torch._C._jit_set_profiling_executor(flag != ProfilingMode.OFF)
-        old_prof_mode_state = torch._C._jit_set_profiling_mode(flag == ProfilingMode.FULL)
-    try:
-        yield
-    finally:
-        if IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR:
-            torch._C._jit_set_profiling_executor(old_prof_exec_state)
-            torch._C._jit_set_profiling_mode(old_prof_mode_state)
-
 def execWrapper(code, glob, loc):
     if PY2:
         exec(code) in glob, loc
@@ -325,13 +305,13 @@ def get_frame_vars(self, frames_up):
         return defined_vars
 
     def checkScriptRaisesRegex(self, script, inputs, exception, regex,
-                               outputs=None, capture_output=False, profiling=ProfilingMode.FULL):
+                               outputs=None, capture_output=False, profiling=ProfilingMode.PROFILING):
         """
         Checks that a given function will throw the correct exception,
         when executed with normal python, the string frontend, and the AST frontend
         """
 
-        with enable_profiling_mode(profiling):
+        with enable_profiling_mode():
             # normal python
             with self.assertRaisesRegex(exception, regex):
                 script(*inputs)
@@ -362,12 +342,12 @@ def checkScript(self,
                     inputs_requires_grad=False,
                     capture_output=False,
                     frames_up=1,
-                    profiling=ProfilingMode.FULL):
+                    profiling=ProfilingMode.PROFILING):
         with torch.jit.optimized_execution(optimize):
-            with enable_profiling_mode(profiling):
+            with enable_profiling_mode():
                 if isinstance(script, str):
                     # Compile the string to a Script function
-                    # with enable_profiling_mode(profiling):
+                    # with enable_profiling_mode():
                     cu = torch.jit.CompilationUnit(script, _frames_up=frames_up)
 
                     # Execute the Python function so we can run it later and get its
@@ -473,6 +453,7 @@ def input_reduce(input, fn, acc):
             outputs_ge = ge(*nograd_inputs)
             self.assertEqual(outputs, outputs_ge)
 
+            # test gradients case
             outputs = func(*recording_inputs)
             if inputs_require_grads:
                 grads = torch.autograd.grad(allSum(outputs), flattened_recording_inputs,

‎test/run_test.py

+3-3
@@ -54,6 +54,9 @@
     'utils',
     'namedtuple_return_api',
     'jit_fuser',
+    'jit_simple',
+    'jit_legacy',
+    'jit_fuser_legacy',
     'tensorboard',
     'namedtensor',
     'type_promotion',
@@ -135,15 +138,13 @@ def run_test(executable, test_module, test_directory, options, *extra_unittest_args):
     # Can't call `python -m unittest test_*` here because it doesn't run code
     # in `if __name__ == '__main__': `. So call `python test_*.py` instead.
     argv = [test_module + '.py'] + unittest_args + list(extra_unittest_args)
-
     command = executable + argv
     return shell(command, test_directory)
 
 
 def test_cuda_primary_ctx(executable, test_module, test_directory, options):
     return run_test(executable, test_module, test_directory, options, '--subprocess')
 
-
 def test_cpp_extensions(executable, test_module, test_directory, options):
     try:
         cpp_extension.verify_ninja_availability()
@@ -444,7 +445,6 @@ def main():
         signal_name = SIGNALS_TO_NAMES_DICT[-return_code]
         message += ' Received signal: {}'.format(signal_name)
         raise RuntimeError(message)
-
     if options.coverage:
         shell(['coverage', 'combine'])
         shell(['coverage', 'html'])

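Note: each entry in the TESTS list resolves to a test_<name>.py module under test/, so the three new shard names map to the wrapper files added later in this commit. A single shard can be run the same way CI does; a small sketch, assuming the repository root as the working directory:

# Sketch (not part of the commit): invoke the legacy-executor JIT shards the
# same way .jenkins/pytorch/test.sh's test_python_ge_config_legacy does.
import subprocess
subprocess.check_call(
    ['python', 'test/run_test.py', '--include', 'jit_legacy', 'jit_fuser_legacy', '--verbose'])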
‎test/test_jit.py

+204-190
Large diffs are not rendered by default.

‎test/test_jit_fuser.py

+12-10
@@ -9,16 +9,16 @@
 import torch.nn.functional as F
 from torch.testing import FileCheck
 
-from common_utils import run_tests, IS_SANDCASTLE
+from common_utils import run_tests, IS_SANDCASTLE, ProfilingMode, GRAPH_EXECUTOR, \
+    enable_profiling_mode
 from textwrap import dedent
 from itertools import product, permutations
 
 from test_jit import JitTestCase, enable_cpu_fuser, RUN_CUDA, RUN_CUDA_HALF, RUN_CUDA_MULTI_GPU, \
     backward_graph, all_backward_graphs, get_lstm_inputs, get_milstm_inputs, \
     LSTMCellC, LSTMCellF, LSTMCellS, MiLSTMCell, _inline_everything
-from jit_utils import enable_profiling_mode, ProfilingMode, IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR
 
-if IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR:
+if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
     torch._C._jit_set_profiling_executor(True)
     torch._C._jit_set_profiling_mode(True)
 
@@ -123,7 +123,7 @@ def scaleshift(x, scale, shift):
 
     @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
     @unittest.skipIf(not RUN_CUDA_HALF, "no half support")
-    @unittest.skipIf(IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR, "no half support with profiling on")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "no half support with profiling on")
     def test_cuda_half(self):
         x = torch.randn(4, 4, dtype=torch.half, device='cuda')
         y = torch.randn(4, 4, dtype=torch.half, device='cuda')
@@ -303,15 +303,16 @@ def funcOptMax(a, b):
         funcs = (func2, funcInf, funcOptMin, funcOptMax)
         for f, inputs in product(funcs, [[a, b], [a, nan]]):
             inp1, inp2 = inputs
-            s = self.checkScript(f, (inp1, inp2), profiling=ProfilingMode.FULL)
+            s = self.checkScript(f, (inp1, inp2), profiling=ProfilingMode.PROFILING)
             self.assertAllFused(s.graph_for(inp1, inp2), except_for={'aten::size', 'aten::_size_if_not_equal'})
             c = s(inp1, inp2)
-            with enable_profiling_mode(ProfilingMode.FULL):
+            with enable_profiling_mode():
                 warmup_backward(c.sum())
             graph = backward_graph(s)
             self.assertAllFused(graph, except_for={'aten::Float'})
 
     @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "no half support with profiling on")
     def test_dropout(self):
         def func(x):
             x = torch.nn.functional.dropout(x)
@@ -461,7 +462,7 @@ def test_exp_cuda(self):
         self.assertAllFused(ge.graph_for(x, y))
 
     @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
-    @unittest.skipIf(IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR, "broken with profiling on")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "broken with profiling on")
     @_inline_everything
     def test_fuse_decompose_normalization(self):
         class ResLike(torch.jit.ScriptModule):
@@ -552,7 +553,7 @@ def fn_test_scalar_arg_requires_grad(x, p):
                                             "aten::_size_if_not_equal"))
 
     @unittest.skipIf(IS_SANDCASTLE, "NYI: fuser CPU support for Sandcastle")
-    @unittest.skipIf(IN_TRANSITION_TO_PROFILING_GRAPH_EXECUTOR, "broken with profiling on")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "broken with profiling on")
     @enable_cpu_fuser
     def test_fuser_deduplication(self):
         # See that fusion kernel outputs are deduplicated when removing _grad_sum_to_size in the fuser's compilation
@@ -905,6 +906,7 @@ def f(x, y):
         self.assertAllFused(script_f.graph_for(x, y), except_for={'prim::TupleConstruct'})
 
     @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "no half support with profiling on")
     def test_grad_sum_to_size_elimination(self):
 
         def my_broadcasted_cell(a, b, c):
@@ -913,7 +915,7 @@ def my_broadcasted_cell(a, b, c):
         s1 = torch.randn(5, 1, requires_grad=True, device='cuda')
         s2 = torch.randn(5, 5, requires_grad=True, device='cuda')
 
-        module = self.checkScript(my_broadcasted_cell, (s1, s1, s1), profiling=ProfilingMode.FULL)
+        module = self.checkScript(my_broadcasted_cell, (s1, s1, s1), profiling=ProfilingMode.PROFILING)
         forward_graph = module.graph_for(s1, s1, s1)
         self.assertAllFused(forward_graph, except_for=("aten::size", "prim::BroadcastSizes",
                                                        "aten::_size_if_not_equal"))
@@ -925,7 +927,7 @@ def my_broadcasted_cell(a, b, c):
             args = s2 if i < 1 else s1, s2 if i < 2 else s1, s2
             args = [a.detach_().requires_grad_() for a in args]
             # recompile, so we don't trigger bailouts
-            module = self.checkScript(my_broadcasted_cell, args, profiling=ProfilingMode.FULL)
+            module = self.checkScript(my_broadcasted_cell, args, profiling=ProfilingMode.PROFILING)
             res = module(s2 if i < 1 else s1, s2 if i < 2 else s1, s2)
             warmup_backward(res.sum(), args)
             grads = torch.autograd.grad(res.sum(), args)

‎test/test_jit_fuser_legacy.py

+6
@@ -0,0 +1,6 @@
+import sys
+sys.argv.append("--ge_config=legacy")
+from test_jit_fuser import *
+
+if __name__ == '__main__':
+    run_tests()

‎test/test_jit_legacy.py

+10
@@ -0,0 +1,10 @@
+import sys
+sys.argv.append("--ge_config=legacy")
+from test_jit import *
+
+if __name__ == '__main__':
+    run_tests()
+    if not PY2:
+        import test_jit_py3
+        suite = unittest.findTestCases(test_jit_py3)
+        unittest.TextTestRunner().run(suite)

‎test/test_jit_simple.py

+10
@@ -0,0 +1,10 @@
+import sys
+sys.argv.append("--ge_config=simple")
+from test_jit import *
+
+if __name__ == '__main__':
+    run_tests()
+    if not PY2:
+        import test_jit_py3
+        suite = unittest.findTestCases(test_jit_py3)
+        unittest.TextTestRunner().run(suite)
unittest.TextTestRunner().run(suite)
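Note: the wrapper modules above force an executor configuration by appending --ge_config to sys.argv before importing the real suite; common_utils.py parses the flag at import time via parse_known_args and sets GRAPH_EXECUTOR accordingly, so the same flag can presumably also be passed directly, e.g. python test/test_jit.py --ge_config=legacy. A small sketch of how the flag is consumed (mirrors the parser added in common_utils.py; not part of the commit):

import argparse

parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('--ge_config', type=str)
# parse_known_args ignores unrelated arguments, so unittest's own flags pass through
args, remaining = parser.parse_known_args(['--ge_config=legacy'])
assert args.ge_config == 'legacy'   # -> GRAPH_EXECUTOR = ProfilingMode.LEGACY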

‎torch/csrc/jit/graph_executor.cpp

+1-1
@@ -495,7 +495,7 @@ struct GraphExecutorImpl : public GraphExecutorImplBase {
   }
 
   ExecutionPlan getPlanFor(Stack& stack) override {
-    return getGraphExecutorOptimize() ? getOrCompile(stack)
+    return getGraphExecutorOptimize() ? getOrCompile(stack)
                                       : getOrCompileFallback();
   }
 

‎torch/csrc/jit/passes/alias_analysis.cpp

+3-1
@@ -363,8 +363,10 @@ void AliasDb::analyzeImpl(Node* node) {
       // TODO: this can be improved with summarizes of what the function does
       // for now we assume the worst
       return analyzeConservative(node);
-    case prim::Print:
     case prim::Uninitialized:
+      giveFreshAlias(node->output());
+      return;
+    case prim::Print:
     case prim::isinstance:
       // These ops do nothing
       return;

‎torch/csrc/jit/profiling_graph_executor_impl.cpp

+22-12
@@ -19,8 +19,14 @@
 namespace torch {
 namespace jit {
 
+#ifdef FBCODE_CAFFE2
 static std::atomic<bool> profiling_mode{false};
 static std::atomic<bool> executor_mode{false};
+#else
+static std::atomic<bool> executor_mode{true};
+static std::atomic<bool> profiling_mode{true};
+#endif
+
 
 std::atomic<bool>& getProfilingMode() {
   return profiling_mode;
@@ -112,22 +118,26 @@ ExecutionPlan ProfilingGraphExecutorImpl::getPlanFor(Stack& stack) {
   // TODO: insert grad propagation
   bool needs_gradient = getProfilingMode()
       ? needsGradientInProfilingMode(copy->block())
-      : needsGradient(copy);
+      : true;
   if (needs_gradient) {
-    auto diff_nodes = CreateAutodiffSubgraphs(
+    // for Simple Executor skip creating autodiff graphs
+    // and let autograd handle backward for us
+    if (getProfilingMode()) {
+      auto diff_nodes = CreateAutodiffSubgraphs(
        copy,
        getAutodiffSubgraphInlining() ? autodiffSubgraphNodeThreshold : 1);
-    for (Node *dnode : diff_nodes) {
-      auto diff_graph = std::move(dnode->g(attr::Subgraph));
-      Gradient gradient = differentiate(diff_graph);
-      runOptimization(gradient.f);
-      // run non diff optimization on the forward graph
-      runNondiffOptimization(gradient.f);
-      packGradient(gradient, dnode);
+      for (Node *dnode : diff_nodes) {
+        auto diff_graph = std::move(dnode->g(attr::Subgraph));
+        Gradient gradient = differentiate(diff_graph);
+        runOptimization(gradient.f);
+        // run non diff optimization on the forward graph
+        runNondiffOptimization(gradient.f);
+        packGradient(gradient, dnode);
+      }
+      InlineAutodiffSubgraphs(copy, getAutodiffSubgraphInlining()
+        ? autodiffSubgraphInlineThreshold
+        : 1);
     }
-    InlineAutodiffSubgraphs(copy, getAutodiffSubgraphInlining()
-      ? autodiffSubgraphInlineThreshold
-      : 1);
   } else {
     runNondiffOptimization(copy);
   }

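Note: with this change the OSS build defaults both executor_mode and profiling_mode to true, while fbcode keeps the previous defaults of false. The Python bindings used throughout the test changes toggle these same two flags and return the previous value, so the state can be restored afterwards. A minimal sketch of flipping them around a region of code (not part of the commit):

import torch

# each setter returns the previous value, as enable_profiling_mode() in common_utils.py relies on
old_exec = torch._C._jit_set_profiling_executor(True)
old_mode = torch._C._jit_set_profiling_mode(True)
try:
    pass  # run TorchScript code under the profiling executor here
finally:
    # restore whatever configuration was active before
    torch._C._jit_set_profiling_executor(old_exec)
    torch._C._jit_set_profiling_mode(old_mode)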