Skip to content

Commit 993628c

Browse files
Krovatkin authored and facebook-github-bot committed on Sep 28, 2020
Build shape expressions and remove outputs that are only used by aten::sizes (pytorch#45080)
Summary: Currently, TE materializes all intermediate results even if they are only used for computing their shapes. This diff ports the approach the OF (Old Fuser) took to deal with this issue. Namely, given the structure of a fusion group we infer all the sizes outside a fusion group based on fusion group's inputs. A simple example would be: ``` def test_fuse(a, b): c = a + b d = c + b return d ``` Here we don't need to cache `c` as computing a gradient for `b` in `d = c + b` doesn't need it. We do need to compute sizes for all arguments here in case broadcasts happen. Without this optimization, TE would need to materialize `c` so we can get its size ``` [DUMP profiling_graph_executor_impl.cpp:499] Optimized Graph: [DUMP profiling_graph_executor_impl.cpp:499] graph(%a.1 : Tensor, [DUMP profiling_graph_executor_impl.cpp:499] %b.1 : Tensor): [DUMP profiling_graph_executor_impl.cpp:499] %11 : Tensor = prim::DifferentiableGraph_0(%b.1, %a.1) [DUMP profiling_graph_executor_impl.cpp:499] return (%11) [DUMP profiling_graph_executor_impl.cpp:499] with prim::DifferentiableGraph_0 = graph(%11 : Tensor, [DUMP profiling_graph_executor_impl.cpp:499] %13 : Tensor): [DUMP profiling_graph_executor_impl.cpp:499] %59 : int[] = aten::size(%13) # <string>:3:44 [DUMP profiling_graph_executor_impl.cpp:499] %62 : int[] = aten::size(%11) # <string>:3:93 [DUMP profiling_graph_executor_impl.cpp:499] %83 : Double(1:1, requires_grad=0, device=cuda:0), %84 : Double(1:1, requires_grad=0, device=cuda:0), %85 : bool = prim::TypeCheck(%11, %13) [DUMP profiling_graph_executor_impl.cpp:499] %86 : Tensor, %87 : Tensor = prim::If(%85) [DUMP profiling_graph_executor_impl.cpp:499] block0(): [DUMP profiling_graph_executor_impl.cpp:499] %d.4 : Double(1:1, requires_grad=0, device=cuda:0), %c.4 : Double(1:1, requires_grad=0, device=cuda:0) = prim::TensorExprGroup_0(%83, %84) [DUMP profiling_graph_executor_impl.cpp:499] -> (%d.4, %c.4) [DUMP profiling_graph_executor_impl.cpp:499] block1(): [DUMP 
profiling_graph_executor_impl.cpp:499] %94 : Function = prim::Constant[name="fallback_function", fallback=1]() [DUMP profiling_graph_executor_impl.cpp:499] %95 : (Tensor, Tensor) = prim::CallFunction(%94, %11, %13) [DUMP profiling_graph_executor_impl.cpp:499] %96 : Tensor, %97 : Tensor = prim::TupleUnpack(%95) [DUMP profiling_graph_executor_impl.cpp:499] -> (%96, %97) [DUMP profiling_graph_executor_impl.cpp:499] %60 : int[] = aten::size(%87) # <string>:3:55 [DUMP profiling_graph_executor_impl.cpp:499] %61 : int[]? = aten::_size_if_not_equal(%59, %60) # <string>:3:19 [DUMP profiling_graph_executor_impl.cpp:499] %64 : int[]? = aten::_size_if_not_equal(%62, %60) # <string>:3:68 [DUMP profiling_graph_executor_impl.cpp:499] %67 : int[] = aten::size(%86) # <string>:3:55 [DUMP profiling_graph_executor_impl.cpp:499] %68 : int[]? = aten::_size_if_not_equal(%60, %67) # <string>:3:19 [DUMP profiling_graph_executor_impl.cpp:499] %71 : int[]? = aten::_size_if_not_equal(%62, %67) # <string>:3:68 [DUMP profiling_graph_executor_impl.cpp:499] return (%86, %61, %64, %68, %71) [DUMP profiling_graph_executor_impl.cpp:499] with prim::TensorExprGroup_0 = graph(%1 : Double(1:1, requires_grad=0, device=cuda:0), [DUMP profiling_graph_executor_impl.cpp:499] %4 : Double(1:1, requires_grad=0, device=cuda:0)): [DUMP profiling_graph_executor_impl.cpp:499] %5 : int = prim::Constant[value=1]() [DUMP profiling_graph_executor_impl.cpp:499] %c.3 : Double(1:1, requires_grad=0, device=cuda:0) = aten::add(%4, %1, %5) # /scratch/villedepommes/pytorches/bench/test/test_jit.py:2872:16 [DUMP profiling_graph_executor_impl.cpp:499] %2 : int = prim::Constant[value=1]() [DUMP profiling_graph_executor_impl.cpp:499] %d.3 : Double(1:1, requires_grad=0, device=cuda:0) = aten::add(%c.3, %1, %2) # /scratch/villedepommes/pytorches/bench/test/test_jit.py:2873:16 [DUMP profiling_graph_executor_impl.cpp:499] return (%d.3, %c.3) ``` With this optimization we use `prim::BroadcastSizes` to compute the size of `c`. 
No need to materialize it. ``` [DUMP profiling_graph_executor_impl.cpp:499] Optimized Graph: [DUMP profiling_graph_executor_impl.cpp:499] graph(%a.1 : Tensor, [DUMP profiling_graph_executor_impl.cpp:499] %b.1 : Tensor): [DUMP profiling_graph_executor_impl.cpp:499] %11 : Tensor = prim::DifferentiableGraph_0(%b.1, %a.1) [DUMP profiling_graph_executor_impl.cpp:499] return (%11) [DUMP profiling_graph_executor_impl.cpp:499] with prim::DifferentiableGraph_0 = graph(%11 : Tensor, [DUMP profiling_graph_executor_impl.cpp:499] %13 : Tensor): [DUMP profiling_graph_executor_impl.cpp:499] %59 : int[] = aten::size(%13) # <string>:3:44 [DUMP profiling_graph_executor_impl.cpp:499] %62 : int[] = aten::size(%11) # <string>:3:93 [DUMP profiling_graph_executor_impl.cpp:499] %88 : Double(1:1, requires_grad=0, device=cuda:0), %89 : Double(1:1, requires_grad=0, device=cuda:0), %90 : bool = prim::TypeCheck(%11, %13) [DUMP profiling_graph_executor_impl.cpp:499] %91 : Tensor = prim::If(%90) [DUMP profiling_graph_executor_impl.cpp:499] block0(): [DUMP profiling_graph_executor_impl.cpp:499] %d.4 : Double(1:1, requires_grad=0, device=cuda:0) = prim::TensorExprGroup_0(%88, %89) [DUMP profiling_graph_executor_impl.cpp:499] -> (%d.4) [DUMP profiling_graph_executor_impl.cpp:499] block1(): [DUMP profiling_graph_executor_impl.cpp:499] %97 : Function = prim::Constant[name="fallback_function", fallback=1]() [DUMP profiling_graph_executor_impl.cpp:499] %98 : (Tensor) = prim::CallFunction(%97, %11, %13) [DUMP profiling_graph_executor_impl.cpp:499] %99 : Tensor = prim::TupleUnpack(%98) [DUMP profiling_graph_executor_impl.cpp:499] -> (%99) [DUMP profiling_graph_executor_impl.cpp:499] %85 : int[] = aten::size(%91) [DUMP profiling_graph_executor_impl.cpp:499] %86 : int[] = prim::BroadcastSizes(%59, %62) [DUMP profiling_graph_executor_impl.cpp:499] %61 : int[]? = aten::_size_if_not_equal(%59, %86) # <string>:3:19 [DUMP profiling_graph_executor_impl.cpp:499] %64 : int[]? 
= aten::_size_if_not_equal(%62, %86) # <string>:3:68 [DUMP profiling_graph_executor_impl.cpp:499] %68 : int[]? = aten::_size_if_not_equal(%86, %85) # <string>:3:19 [DUMP profiling_graph_executor_impl.cpp:499] %71 : int[]? = aten::_size_if_not_equal(%62, %85) # <string>:3:68 [DUMP profiling_graph_executor_impl.cpp:499] return (%91, %61, %64, %68, %71) [DUMP profiling_graph_executor_impl.cpp:499] with prim::TensorExprGroup_0 = graph(%1 : Double(1:1, requires_grad=0, device=cuda:0), [DUMP profiling_graph_executor_impl.cpp:499] %4 : Double(1:1, requires_grad=0, device=cuda:0)): [DUMP profiling_graph_executor_impl.cpp:499] %5 : int = prim::Constant[value=1]() [DUMP profiling_graph_executor_impl.cpp:499] %c.3 : Double(1:1, requires_grad=0, device=cuda:0) = aten::add(%4, %1, %5) # /scratch/villedepommes/pytorches/bench/test/test_jit.py:2872:16 [DUMP profiling_graph_executor_impl.cpp:499] %2 : int = prim::Constant[value=1]() [DUMP profiling_graph_executor_impl.cpp:499] %d.3 : Double(1:1, requires_grad=0, device=cuda:0) = aten::add(%c.3, %1, %2) # /scratch/villedepommes/pytorches/bench/test/test_jit.py:2873:16 [DUMP profiling_graph_executor_impl.cpp:499] return (%d.3) ``` Pull Request resolved: pytorch#45080 Reviewed By: bertmaher Differential Revision: D23856410 Pulled By: Krovatkin fbshipit-source-id: 2956286eb03a4894a5baa151c35e6092466322b1
1 parent e5242aa commit 993628c

File tree

4 files changed

+171
-20
lines changed

4 files changed

+171
-20
lines changed
 

‎test/test_jit_fuser_te.py

+20
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,26 @@ def foo(hx, cx):
683683
# XXX: TE fuser can handle concats in a fusion group.
684684
# FileCheck().check("FusedConcat").check_next("return").run(str(graph))
685685

686+
@unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
def test_remove_output_used_only_in_size(self):
    # Fusing `c = a + b; d = c + b` should let the fuser drop `c` as a
    # fusion-group output: `c` is only needed for its size, which can be
    # recomputed outside the group from the input sizes.
    def fn(a, b):
        c = a + b
        d = c + b
        return d

    scripted = torch.jit.script(fn)
    lhs = torch.ones(1, requires_grad=True, device='cuda')
    rhs = torch.ones(1, requires_grad=True, device='cuda')
    warmup_forward(scripted, lhs, rhs)
    graph = torch.jit.last_executed_optimized_graph()
    diff_graphs = [n for n in graph.nodes()
                   if n.kind() == 'prim::DifferentiableGraph']
    self.assertEqual(len(diff_graphs), 1)
    subgraph = diff_graphs[0].g('Subgraph')
    guards = [n for n in subgraph.nodes() if n.kind() == 'prim::If']
    self.assertEqual(len(guards), 1)
    # Both the guard node and the fusion group it wraps should expose a
    # single output -- the intermediate `c` must have been removed.
    self.assertEqual(len(list(guards[0].outputs())), 1)
705+
686706
@unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
687707
def test_concat_invariant_cuda(self):
688708
# Invariant: the output of prim::FusedConcat may

‎torch/csrc/jit/passes/graph_fuser.cpp

+1-17
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
88
#include <torch/csrc/jit/passes/constant_pooling.h>
99
#include <torch/csrc/jit/passes/dead_code_elimination.h>
10+
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
1011
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
1112
#include <torch/csrc/jit/runtime/autodiff.h>
1213
#include <torch/csrc/jit/runtime/custom_operator.h>
@@ -120,16 +121,6 @@ bool isSimpleMap(Node* node) {
120121
return true;
121122
}
122123

123-
Value* broadcastSizes(at::ArrayRef<Value*> sizes, AliasDb* db) {
124-
AT_ASSERT(!sizes.empty());
125-
Graph* graph = sizes[0]->owningGraph();
126-
Node* broadcast_n =
127-
graph->insertNode(graph->create(prim::BroadcastSizes, sizes));
128-
broadcast_n->output()->setType(ListType::ofInts());
129-
db->createValue(broadcast_n->output());
130-
return broadcast_n->output();
131-
}
132-
133124
struct GraphFuser {
134125
using FusionCallback = std::function<bool(GraphFuser*, Node*)>;
135126

@@ -926,13 +917,6 @@ struct GraphFuser {
926917
}
927918
}
928919

929-
bool usedOnlyInSize(Value* v) {
930-
const auto& uses = v->uses();
931-
return std::all_of(uses.begin(), uses.end(), [](const Use& u) {
932-
return u.user->matches("aten::size(Tensor self) -> int[]");
933-
});
934-
}
935-
936920
// Builds up expressions that compute shapes of all intermediates (and
937921
// outputs) of the fusion group, based on the sizes of inputs. You should run
938922
// DCE to remove those that you end up not using.

‎torch/csrc/jit/passes/tensorexpr_fuser.cpp

+147-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,23 @@ bool isSupportedForBlock(Node* node) {
2929
}
3030
}
3131

32+
// Returns true iff every use of `v` is an aten::size call -- i.e. only the
// shape of the value is ever consumed, never its contents.
bool usedOnlyInSize(Value* v) {
  for (const Use& use : v->uses()) {
    if (!use.user->matches("aten::size(Tensor self) -> int[]")) {
      return false;
    }
  }
  return true;
}
38+
39+
// Inserts a prim::BroadcastSizes node over the given int[] size values and
// returns its output (typed int[]), registering it with the alias database.
// Requires a non-empty `sizes` list.
Value* broadcastSizes(at::ArrayRef<Value*> sizes, AliasDb* db) {
  AT_ASSERT(!sizes.empty());
  Graph* graph = sizes[0]->owningGraph();
  Node* broadcast = graph->create(prim::BroadcastSizes, sizes);
  graph->insertNode(broadcast);
  Value* result = broadcast->output();
  result->setType(ListType::ofInts());
  db->createValue(result);
  return result;
}
48+
3249
namespace tensorexpr {
3350
bool isSupported(Node* node) {
3451
// For Block codegen we allow limited ops.
@@ -287,6 +304,132 @@ class TensorExprFuser {
287304
min_group_size_(min_group_size),
288305
disable_shape_checks_(disable_shape_checks) {}
289306

307+
// Builds up expressions that compute shapes of all intermediates (and
// outputs) of the fusion group, based on the sizes of inputs. You should run
// DCE to remove those that you end up not using.
//
// Returns a map from subgraph Value* to a Value* (in the outer graph) that
// evaluates to its int[] size. Nodes whose inputs have no known sizes (and
// aten::cat / prim::Constant) are skipped, so the map may be partial.
std::unordered_map<Value*, Value*> buildShapeExpressions(Node* fusion_group) {
  GRAPH_DUMP("buildShapeExpressions for ", fusion_group->g(attr::Subgraph));
  // All size expressions are inserted into the outer graph, right after the
  // fusion group node.
  WithInsertPoint insert_guard{fusion_group->next()};
  std::unordered_map<Value*, Value*> shape_of;

  Graph* graph = fusion_group->owningGraph();
  auto subgraph = fusion_group->g(attr::Subgraph);

  // Seed the map: for every Tensor input of the group, emit an aten::size
  // call on the outer-graph input and map the corresponding subgraph input
  // to it.
  auto inputs = fusion_group->inputs();
  auto sinputs = subgraph->inputs();
  AT_ASSERT(inputs.size() == sinputs.size());
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i]->type()->isSubtypeOf(TensorType::get())) {
      Value* soutput = graph->insert(aten::size, {inputs[i]});
      aliasDb_->createValue(soutput);
      GRAPH_DEBUG(
          "Adding a mapping for %",
          sinputs[i]->debugName(),
          " ",
          getHeader(soutput->node()));
      shape_of[sinputs[i]] = soutput;
    }
  }

  // When we have a guarantee that an output won't be removed, because it's
  // used in expressions that don't involve size checks, we can use its size
  // instead of computing a long chain of broadcasts, starting from the
  // beginning of the kernel. (Outputs used only in aten::size are skipped --
  // those are exactly the ones we hope to remove later.)
  auto outputs = fusion_group->outputs();
  auto soutputs = subgraph->outputs();
  AT_ASSERT(outputs.size() == soutputs.size());
  for (size_t i = 0; i < outputs.size(); ++i) {
    if (usedOnlyInSize(outputs[i]))
      continue;
    Value* soutput = graph->insert(aten::size, {outputs[i]});
    aliasDb_->createValue(soutput);
    shape_of[soutputs[i]] = soutput;
  }

  // Walk the subgraph in order, deriving each node's output size from the
  // (already computed) sizes of its tensor inputs.
  for (Node* n : subgraph->nodes()) {
    // XXX: Use of shape_of.emplace is crucial to the output shape
    // optimization! (emplace does not overwrite entries seeded from the
    // outputs above, so direct aten::size of an output wins over a
    // broadcast chain.)
    if (n->kind() == aten::cat) {
      // This is a bit more involved, because we have to account for the case
      // when inputs have different shapes, but fortunately those tensors are
      // always outputs, and so we can simply avoid replacing their queries,
      // because it won't help us.
      continue;
    }
    if (n->kind() == prim::Constant) {
      continue;
    }
    if (n->kind() == prim::ConstantChunk) {
      // prim::ChunkSizes yields two int[] outputs: the size of the regular
      // chunks and the size of the (possibly smaller) last chunk.
      Node* sizes_node = graph->insertNode(
          graph->create(prim::ChunkSizes, shape_of.at(n->input()), 2));
      sizes_node->i_(attr::dim, n->i(attr::dim));
      sizes_node->i_(attr::chunks, n->i(attr::chunks));
      for (Value* output : sizes_node->outputs()) {
        aliasDb_->createValue(output);
      }
      Value* regular_size = sizes_node->outputs().at(0);
      Value* last_size = sizes_node->outputs().at(1);
      regular_size->setType(ListType::ofInts());
      last_size->setType(ListType::ofInts());
      auto outputs = n->outputs();
      for (Value* o : outputs.slice(0, outputs.size() - 1)) {
        shape_of.emplace(o, regular_size);
      }
      shape_of.emplace(outputs.at(outputs.size() - 1), last_size);
      continue;
    }
    auto tensor_inputs = filter(n->inputs(), [](Value* v) {
      return v->type()->isSubtypeOf(TensorType::get());
    });
    GRAPH_DEBUG("Building sizes for ", getHeader(n));
    bool all_inputs_have_sizes = true;
    auto shapes = fmap(tensor_inputs, [&](Value* v) {
      GRAPH_DEBUG("Getting aten::size for %", v->debugName());
      all_inputs_have_sizes &= shape_of.count(v);
      return shape_of.count(v) != 0 ? shape_of.at(v) : nullptr;
    });

    if (!all_inputs_have_sizes) {
      GRAPH_DEBUG(
          "Not all tensor arguments have sizes available to compute the broadcasted size",
          getHeader(n));
      continue;
    }
    // NOTE(review): only n->output() (the single/first output) receives a
    // shape entry here; multi-output ops other than ConstantChunk are
    // presumably not admitted by the fuser -- confirm against the op
    // whitelist.
    shape_of.emplace(
        n->output(),
        shapes.size() == 1 ? shapes[0]
                           : broadcastSizes(shapes, aliasDb_.get()));
  }
  return shape_of;
}
405+
406+
// Drops fusion-group outputs that are consumed exclusively by aten::size.
// Each such size query is rewired to a shape expression built outside the
// group (see buildShapeExpressions), after which the now-dead output is
// erased from both the group node and its subgraph.
void removeOutputsUsedOnlyInSize(Node* fusion_group) {
  if (fusion_group->kind() != prim::TensorExprGroup)
    return;
  auto subgraph = fusion_group->g(attr::Subgraph);

  auto shape_of = buildShapeExpressions(fusion_group);
  auto group_outputs = fusion_group->outputs().vec();
  auto subgraph_outputs = subgraph->outputs().vec();
  // XXX: Traverse outputs back-to-front. This is not only good for
  // performance: erasing output i shifts every later index, so only a
  // reverse walk keeps idx pointing at the true current output.
  for (size_t idx = group_outputs.size(); idx-- > 0;) {
    Value* output = group_outputs[idx];
    Value* soutput = subgraph_outputs[idx];
    if (!usedOnlyInSize(output) || shape_of.count(soutput) == 0) {
      continue;
    }
    // Snapshot the uses: destroying a user mutates the use list.
    auto size_uses = output->uses();
    for (const Use& u : size_uses) {
      AT_ASSERT(u.user->matches("aten::size(Tensor self) -> int[]"));
      u.user->output()->replaceAllUsesWith(shape_of.at(soutput));
      u.user->destroy();
    }
    fusion_group->eraseOutput(idx);
    subgraph->eraseOutput(idx);
  }
}
432+
290433
void run() {
291434
aliasDb_ = torch::make_unique<AliasDb>(graph_);
292435
RemoveRedundantProfiles(graph_);
@@ -298,7 +441,7 @@ class TensorExprFuser {
298441
// fusion is done.
299442
inlineSmallFusionGroups(graph_->block());
300443
GRAPH_DUMP("After inlining small fusion groups: ", graph_);
301-
guardFusionGroups(graph_->block());
444+
guardFusionGroupsAndRemoveOutputs(graph_->block());
302445
GRAPH_DUMP("After guarding fusion groups: ", graph_);
303446
removeTensorTypeSpecializations(graph_->block());
304447
GRAPH_DUMP("After removing tensor type specializations: ", graph_);
@@ -772,17 +915,18 @@ class TensorExprFuser {
772915
}
773916
}
774917

775-
void guardFusionGroups(Block* block) {
918+
void guardFusionGroupsAndRemoveOutputs(Block* block) {
776919
std::vector<Node*> fusion_groups;
777920
for (Node* n : block->nodes()) {
778921
for (Block* b : n->blocks()) {
779-
guardFusionGroups(b);
922+
guardFusionGroupsAndRemoveOutputs(b);
780923
}
781924
if (n->kind() == prim::TensorExprGroup) {
782925
fusion_groups.push_back(n);
783926
}
784927
}
785928
for (Node* fusion_group : fusion_groups) {
929+
removeOutputsUsedOnlyInSize(fusion_group);
786930
guardFusionGroup(fusion_group);
787931
}
788932
}

‎torch/csrc/jit/passes/tensorexpr_fuser.h

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ TORCH_API void RemoveProfileNodesAndSpecializeTypes(
2929
std::shared_ptr<Graph>& graph);
3030
TORCH_API void RemoveTensorTypeSpecializations(std::shared_ptr<Graph>& graph);
3131

32+
TORCH_API bool usedOnlyInSize(Value* v);
33+
TORCH_API Value* broadcastSizes(at::ArrayRef<Value*> sizes, AliasDb* db);
34+
3235
namespace tensorexpr {
3336
TORCH_API bool isSupported(Node* node);
3437
}

0 commit comments

Comments
 (0)
Please sign in to comment.