[TF:XLA] Limit autoclustering of ShapeConsumingOps on the first pass.
Additionally prevent partial_declustering of these ops.

PiperOrigin-RevId: 248091621
tensorflower-gardener committed May 14, 2019
1 parent c095504 commit c1de6bc
Showing 12 changed files with 242 additions and 35 deletions.
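For context, the "ShapeConsumingOps" referenced by this change are operators that read only the shape of their input, never its data, so their outputs are tiny compared to the tensors they consume. The sketch below is a minimal restatement of the IsShapeConsumerOp predicate that the new heuristic keys on; its body mirrors the definition visible in the partially_decluster_pass.cc hunk at the end of this diff (where this commit removes the local copy), while the include and namespace wrapping here are illustrative only.

#include "tensorflow/core/graph/graph.h"

namespace tensorflow {

// Sketch: an op is "shape consuming" if it only inspects the shape of its
// input tensor, producing a small scalar-like result from a large input.
bool IsShapeConsumerOp(const Node& node) {
  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
         node.type_string() == "Size";
}

}  // namespace tensorflow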
1 change: 1 addition & 0 deletions tensorflow/cc/BUILD
@@ -203,6 +203,7 @@ tf_cc_test(
deps = [
":ops",
":scope",
"//tensorflow/cc:cc_ops",
"//tensorflow/core:framework",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
19 changes: 19 additions & 0 deletions tensorflow/cc/framework/scope.cc
@@ -531,4 +531,23 @@ Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner) {
return InternalScope::NewScope(graph, status, refiner);
}

Status CreateOutputWithScope(string op_name,
absl::Span<const ::tensorflow::Input> inputs,
const Scope& scope, Output* output) {
TF_RETURN_IF_ERROR(scope.status());
const auto unique_name = scope.GetUniqueNameForOp(op_name);
auto builder = ::tensorflow::NodeBuilder(unique_name, op_name);
for (auto input : inputs) {
TF_RETURN_IF_ERROR(scope.status());
builder = builder.Input(input.node());
}
::tensorflow::Node* ret;
scope.UpdateBuilder(&builder);
TF_RETURN_IF_ERROR(scope.status());
scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
TF_RETURN_IF_ERROR(scope.status());
*output = Output(ret, 0);
return Status::OK();
}

} // namespace tensorflow
6 changes: 6 additions & 0 deletions tensorflow/cc/framework/scope.h
@@ -255,6 +255,12 @@ struct CompositeOpScopes {
Scope last;
};

// Creates a node of the given operation, with the given inputs, and assigns
// the result to `output`. This does not support adding additional attributes.
Status CreateOutputWithScope(string op_name,
absl::Span<const ::tensorflow::Input> inputs,
const Scope& scope, Output* output);
/// @}

} // namespace tensorflow
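A minimal usage sketch of the new helper declared above, assuming an existing root scope and the usual cc/ops headers; the wrapper name BuildAddWithHelper is hypothetical, and the scope_test.cc addition below exercises the same path with an "Add" node.

#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/array_ops.h"

namespace tensorflow {

// Sketch: build an "Add" node from a placeholder via the new helper. Errors
// from the scope or from node construction surface through the Status.
Status BuildAddWithHelper(const Scope& root, Output* add) {
  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
  return CreateOutputWithScope("Add", {a, a}, root.WithOpName("add"), add);
}

}  // namespace tensorflow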
12 changes: 12 additions & 0 deletions tensorflow/cc/framework/scope_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/cc/framework/scope.h"

#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {
@@ -145,4 +147,14 @@ TEST(ScopeTest, ControlDeps) {
EXPECT_EQ(c_c.control_deps().size(), 3);
}

TEST(ScopeTest, CreateOutput) {
Scope root = Scope::NewRootScope();
Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
Output add;
ASSERT_TRUE(
CreateOutputWithScope("Add", {a, a}, root.WithOpName("add"), &add).ok());
EXPECT_EQ(add.node()->name(), "add");
EXPECT_EQ(add.node()->type_string(), "Add");
}

} // namespace tensorflow
10 changes: 10 additions & 0 deletions tensorflow/compiler/jit/graphcycles/graphcycles.cc
@@ -410,6 +410,16 @@ absl::Span<const int32> GraphCycles::Predecessors(int32 node) const {
return rep_->nodes_[node]->in.GetSequence();
}

std::vector<int32> GraphCycles::SuccessorsCopy(int32 node) const {
absl::Span<const int32> successors = Successors(node);
return std::vector<int32>(successors.begin(), successors.end());
}

std::vector<int32> GraphCycles::PredecessorsCopy(int32 node) const {
absl::Span<const int32> predecessors = Predecessors(node);
return std::vector<int32>(predecessors.begin(), predecessors.end());
}

namespace {
void SortInPostOrder(absl::Span<Node* const> nodes,
std::vector<int32>* to_sort) {
9 changes: 9 additions & 0 deletions tensorflow/compiler/jit/graphcycles/graphcycles.h
@@ -118,9 +118,18 @@ class GraphCycles {
// Expensive: should only be called from graphcycles_test.cc.
bool CheckInvariants() const;

// Warning: Do not use these if iterating over the span and modifying the
// GraphCycles at the same time. Instead use SuccessorsCopy/PredecessorsCopy.
absl::Span<const int32> Successors(int32 node) const;
absl::Span<const int32> Predecessors(int32 node) const;

// Returns a copy of the successors set. This is needed for code using the
// collection while modifying the GraphCycles.
std::vector<int32> SuccessorsCopy(int32 node) const;
// Returns a copy of the predecessors set. This is needed for code using the
// collection while modifying the GraphCycles.
std::vector<int32> PredecessorsCopy(int32 node) const;

// Returns all nodes in post order.
//
// If there is a path from X to Y then X appears after Y in the
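A minimal sketch of why the copying accessors exist, assuming a GraphCycles instance whose edges are removed while being iterated (the pattern used by mark_for_compilation_pass.cc below): the span returned by Successors() aliases GraphCycles-internal storage and may be invalidated when the graph is mutated, so callers snapshot it first. The helper name DropOutgoingEdges is hypothetical.

#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"

namespace tensorflow {

// Sketch: remove every edge out of `node` while mutating the graph.
// Iterating the snapshot from SuccessorsCopy() stays valid; iterating the
// raw Successors() span here could read storage the mutations invalidate.
void DropOutgoingEdges(GraphCycles* graph, int32 node) {
  for (int32 successor : graph->SuccessorsCopy(node)) {
    graph->RemoveEdge(node, successor);
  }
}

}  // namespace tensorflow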
107 changes: 84 additions & 23 deletions tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -233,6 +233,29 @@ class MarkForCompilationPassImpl {
// Returns true if any new clusters were created.
StatusOr<bool> RunEdgeContractionLoopInPostOrderOnce();

// Runs through all the nodes in `cycles_graph_` and tries to contract high
// priority edges for clusters. Returns true if any new clusters were created.
//
// There are potentially many maximal clustering results, but they will not
// all be equally performant. Some clustering decisions are likely to improve
// performance much more than others, and we cannot order contractions on this
// cost function, nor can we look at global information while deciding on
// individual edges to contract. Instead, we make decisions on these important
// edges first and then on all other edges, giving the most important edges the
// highest chance of being contracted.
//
// An example of where this might occur is with a digraph:
// {A -> B, B -> C, A -> X, X -> C} where B is a Size operation and X is
// not-compilable. In this case, the valid clusterings are {A,B} or {B,C}. B
// should be clustered with A because it will prevent a potentially large
// tensor from A being computed and copied.
//
// Running this pass first ensures that the preferred contraction happens; this
// cannot be enforced in a single pass over the graph with the current
// algorithm, since we need to prevent B->C from being clustered in
// anticipation of the later, more beneficial A->B cluster.
StatusOr<bool> ContractPreferredEdges();

// Contracts as many edges as possible to create XLA clusters. After this
// finishes the clustering decisions made are implicitly stored in
// `clusters_`.
@@ -314,6 +337,13 @@ class MarkForCompilationPassImpl {
//
// Returns nullptr if `node_id` is not a compilation candidate.
Cluster* GetClusterForCyclesGraphNode(int node_id) {
// We have to check `graph_->FindNodeId(node_id) == nullptr` because we add
// all nodes in [0, graph_->num_node_ids()) to the cycle detection graph but
// the TF graph may be missing some node ids.
if (node_id >= graph_->num_node_ids() ||
graph_->FindNodeId(node_id) == nullptr) {
return nullptr;
}
Cluster* cluster = cluster_for_node_[node_id].Get();
if (cluster) {
DCHECK_EQ(cluster->cycles_graph_node_id(), node_id);
@@ -581,6 +611,50 @@ Status MarkForCompilationPassImpl::Initialize() {
return BuildInitialClusterSet();
}

StatusOr<bool> MarkForCompilationPassImpl::ContractPreferredEdges() {
bool changed = false;
for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
if (!cluster_from) {
continue;
}

// Make a copy of the set of successors because we may modify the graph in
// TryToContractEdge.
std::vector<int32> successors_copy =
cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

for (int to : successors_copy) {
iteration_count_++;

Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
if (!cluster_to) {
continue;
}

if (cluster_to->cluster_size() == 1) {
Node* n = graph_->FindNodeId(cluster_to->GetIdOfOnlyNode());

// It is desirable to cluster shape-consuming operations with their operands
// because they return a small set of scalar values after consuming a large
// amount of data. For example, given a graph X -> Y -> Size -> Z, where the
// possible clusterings are [{X, Y, Size}, {Z}] or [{X, Y}, {Size, Z}], the
// better choice is to cluster Size with Y: the output of Size is a small
// tensor while Y is a potentially large tensor that must be computed and
// possibly transposed/copied before the second cluster executes.
if (IsShapeConsumerOp(*n)) {
TF_ASSIGN_OR_RETURN(bool contracted_edge,
TryToContractEdge(cluster_from, cluster_to));
changed |= contracted_edge;
}
}
}
}

return changed;
}

StatusOr<bool>
MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
bool changed = false;
@@ -596,15 +670,8 @@ MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
// digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible
// to contract Y->Z if Y->Z was not contractible originally.
for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
// We have to check `graph_->FindNodeId(node) == nullptr` because we add all
// nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
// TF graph may be missing some node ids.
if (node >= graph_->num_node_ids() || graph_->FindNodeId(node) == nullptr) {
continue;
}

Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
if (cluster_from == nullptr) {
if (!cluster_from) {
continue;
}

@@ -623,7 +690,13 @@ Status MarkForCompilationPassImpl::RunEdgeContractionLoop() {
// TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
// example, from the Grappler fusion pass).

TF_ASSIGN_OR_RETURN(bool changed, RunEdgeContractionLoopInPostOrderOnce());
// Run the contraction loop twice: first targeting only very beneficial edges,
// then without restrictions. This helps minimize the data output from clusters
// (and possible transpose operations before outputs) that might occur if a
// ShapeConsumingOp sits on the boundary of two clusters due to cycle
// considerations.
TF_ASSIGN_OR_RETURN(bool changed, ContractPreferredEdges());

TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce());

// Check that RunEdgeContractionLoopInPostOrderOnce is idempotent. Once the
// linear time post-order scheme has been battle tested we can move this to
@@ -711,10 +784,6 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency(
// where a cluster is producing data for multiple devices.
for (const auto& in_id :
cycles_graph_.Predecessors(cluster_to.cycles_graph_node_id())) {
if (in_id >= graph_->num_node_ids()) {
continue;
}

const Cluster* cluster_in = GetClusterForCyclesGraphNode(in_id);
if (cluster_in) {
TF_ASSIGN_OR_RETURN(bool devices_compatible,
@@ -1070,19 +1139,11 @@ StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgesFrom(

// Make a copy of the set of successors because we may modify the graph in
// TryToContractEdge.
std::vector<int32> successors_copy = [&] {
absl::Span<const int32> successors =
cycles_graph_.Successors(cluster_from->cycles_graph_node_id());
return std::vector<int32>(successors.begin(), successors.end());
}();
std::vector<int32> successors_copy =
cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

for (int to : successors_copy) {
iteration_count_++;
if (to >= graph_->num_node_ids()) {
// Node is a fictitious node that is present only in the cycle detection
// graph. No clustering is possible.
continue;
}

Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
if (!cluster_to) {
54 changes: 54 additions & 0 deletions tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -1548,5 +1548,59 @@ TEST(XlaCompilationTest, DontClusterNodesWithForwardFromAttr) {
EXPECT_EQ(clusters["test/z"], "");
}

// Note: this test relies on implementation details beyond the specific
// heuristic we care about, so other changes might be at fault if it breaks.
// What we care about is that if a ShapeConsumingOp can be connected with a
// producer or a consumer but cannot be clustered with both, it should be
// clustered with the producer.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducer) {
Scope root = Scope::NewRootScope().ExitOnError();
Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
Output y = ops::Size(root.WithOpName("test/y"), x);
Output z = ops::Add(root.WithOpName("test/z"), y, y);

std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
TF_ASSERT_OK(root.ToGraph(graph.get()));

// Ensure that the "Size" op can only be clustered with either the producer or
// consumer by putting them on different devices.
FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kGPU0);
FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kCPU0);
FindNodeByName(graph.get(), "test/z")->set_assigned_device_name(kGPU1);

TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

std::unordered_map<string, string> clusters = GetClusters(*graph);

EXPECT_NE(clusters["test/y"], "");
EXPECT_EQ(clusters["test/x"], clusters["test/y"]);
EXPECT_NE(clusters["test/z"], clusters["test/y"]);
}

// Test that ShapeConsuming ops are still fully clustered whenever possible.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducerAndConsumer) {
Scope root = Scope::NewRootScope().ExitOnError();
Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
Output y = ops::Size(root.WithOpName("test/y"), x);
Output z = ops::Add(root.WithOpName("test/z"), y, y);

std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
TF_ASSERT_OK(root.ToGraph(graph.get()));

TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

std::unordered_map<string, string> clusters = GetClusters(*graph);

EXPECT_NE(clusters["test/y"], "");
EXPECT_EQ(clusters["test/y"], clusters["test/x"]);
EXPECT_EQ(clusters["test/y"], clusters["test/z"]);
}

} // namespace
} // namespace tensorflow
15 changes: 9 additions & 6 deletions tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -51,6 +51,15 @@ Status FindNodesToDecluster(const Graph& graph,
continue;
}

// Assume the benefit of not outputting a larger tensor outweighs the
// benefit of this check.
// TODO(tpopp): Only apply this if the value being consumed is not output
// from the cluster to another consumer.
// TODO(tpopp): See if XlaRun can be modified to avoid this issue
// completely.
if (IsShapeConsumerOp(*n)) {
continue;
}
// We assume the only XLA-auto-clusterable operations with side effects are
// resource variable updates. We can't execute these twice.
if (HasResourceInputOrOutput(*n)) {
@@ -344,12 +353,6 @@ Status PartiallyDeclusterGraph(Graph* graph,
} // namespace reduce_recompilation

namespace decluster_root_shape_consumers {
// Returns true if `node` is an operator that consumes only the shape of its
// input, not the data itself.
bool IsShapeConsumerOp(const Node& node) {
return node.type_string() == "Shape" || node.type_string() == "Rank" ||
node.type_string() == "Size";
}

Status PartiallyDeclusterGraph(Graph* graph) {
std::vector<Node*> reverse_post_order;