[TF:XLA] Limit autoclustering of ShapeConsumingOps on the first pass.
Additionally prevent partial_declustering of these ops.

PiperOrigin-RevId: 248091621
tensorflower-gardener committed May 14, 2019
1 parent c095504 commit c1de6bc
Showing 12 changed files with 242 additions and 35 deletions.
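For context, the "ShapeConsumingOps" referenced by this change are operators that read only the shape of their input, never its data, so their outputs are tiny compared to the tensors they consume. The sketch below is a minimal restatement of the IsShapeConsumerOp predicate that the new heuristic keys on; its body mirrors the definition visible in the partially_decluster_pass.cc hunk at the end of this diff (where this commit removes the local copy), while the include and namespace wrapping here are illustrative only.

#include "tensorflow/core/graph/graph.h"

namespace tensorflow {

// Sketch: an op is "shape consuming" if it only inspects the shape of its
// input tensor, producing a small scalar-like result from a large input.
bool IsShapeConsumerOp(const Node& node) {
  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
         node.type_string() == "Size";
}

}  // namespace tensorflow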
1 change: 1 addition & 0 deletions tensorflow/cc/BUILD
@@ -203,6 +203,7 @@ tf_cc_test(
deps = [
":ops",
":scope",
"//tensorflow/cc:cc_ops",
"//tensorflow/core:framework",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
19 changes: 19 additions & 0 deletions tensorflow/cc/framework/scope.cc
@@ -531,4 +531,23 @@ Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner) {
return InternalScope::NewScope(graph, status, refiner);
}

Status CreateOutputWithScope(string op_name,
absl::Span<const ::tensorflow::Input> inputs,
const Scope& scope, Output* output) {
TF_RETURN_IF_ERROR(scope.status());
const auto unique_name = scope.GetUniqueNameForOp(op_name);
auto builder = ::tensorflow::NodeBuilder(unique_name, op_name);
for (auto input : inputs) {
TF_RETURN_IF_ERROR(scope.status());
builder = builder.Input(input.node());
}
::tensorflow::Node* ret;
scope.UpdateBuilder(&builder);
TF_RETURN_IF_ERROR(scope.status());
scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
TF_RETURN_IF_ERROR(scope.status());
*output = Output(ret, 0);
return Status::OK();
}

} // namespace tensorflow
6 changes: 6 additions & 0 deletions tensorflow/cc/framework/scope.h
@@ -255,6 +255,12 @@ struct CompositeOpScopes {
Scope last;
};

// Creates a node of the given operation, with the given inputs, and assigns
// the result to `output`. This does not support adding additional attributes.
Status CreateOutputWithScope(string op_name,
absl::Span<const ::tensorflow::Input> inputs,
const Scope& scope, Output* output);
/// @}

} // namespace tensorflow
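A minimal usage sketch of the new helper declared above, assuming an existing root scope and the usual cc/ops headers; the wrapper name BuildAddWithHelper is hypothetical, and the scope_test.cc addition below exercises the same path with an "Add" node.

#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/array_ops.h"

namespace tensorflow {

// Sketch: build an "Add" node from a placeholder via the new helper. Errors
// from the scope or from node construction surface through the Status.
Status BuildAddWithHelper(const Scope& root, Output* add) {
  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
  return CreateOutputWithScope("Add", {a, a}, root.WithOpName("add"), add);
}

}  // namespace tensorflow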
12 changes: 12 additions & 0 deletions tensorflow/cc/framework/scope_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/cc/framework/scope.h"

#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {
@@ -145,4 +147,14 @@ TEST(ScopeTest, ControlDeps) {
EXPECT_EQ(c_c.control_deps().size(), 3);
}

TEST(ScopeTest, CreateOutput) {
Scope root = Scope::NewRootScope();
Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
Output add;
ASSERT_TRUE(
CreateOutputWithScope("Add", {a, a}, root.WithOpName("add"), &add).ok());
EXPECT_EQ(add.node()->name(), "add");
EXPECT_EQ(add.node()->type_string(), "Add");
}

} // namespace tensorflow
10 changes: 10 additions & 0 deletions tensorflow/compiler/jit/graphcycles/graphcycles.cc
@@ -410,6 +410,16 @@ absl::Span<const int32> GraphCycles::Predecessors(int32 node) const {
return rep_->nodes_[node]->in.GetSequence();
}

std::vector<int32> GraphCycles::SuccessorsCopy(int32 node) const {
absl::Span<const int32> successors = Successors(node);
return std::vector<int32>(successors.begin(), successors.end());
}

std::vector<int32> GraphCycles::PredecessorsCopy(int32 node) const {
absl::Span<const int32> predecessors = Predecessors(node);
return std::vector<int32>(predecessors.begin(), predecessors.end());
}

namespace {
void SortInPostOrder(absl::Span<Node* const> nodes,
std::vector<int32>* to_sort) {
9 changes: 9 additions & 0 deletions tensorflow/compiler/jit/graphcycles/graphcycles.h
@@ -118,9 +118,18 @@ class GraphCycles {
// Expensive: should only be called from graphcycles_test.cc.
bool CheckInvariants() const;

// Warning: Do not use these if iterating over the span and modifying the
// GraphCycles at the same time. Instead use SuccessorsCopy/PredecessorsCopy.
absl::Span<const int32> Successors(int32 node) const;
absl::Span<const int32> Predecessors(int32 node) const;

// Returns a copy of the successors set. This is needed for code using the
// collection while modifying the GraphCycles.
std::vector<int32> SuccessorsCopy(int32 node) const;
// Returns a copy of the predecessors set. This is needed for code using the
// collection while modifying the GraphCycles.
std::vector<int32> PredecessorsCopy(int32 node) const;

// Returns all nodes in post order.
//
// If there is a path from X to Y then X appears after Y in the
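A minimal sketch of why the copying accessors exist, assuming a GraphCycles instance whose edges are removed while being iterated (the pattern used by mark_for_compilation_pass.cc below): the span returned by Successors() aliases GraphCycles-internal storage and may be invalidated when the graph is mutated, so callers snapshot it first. The helper name DropOutgoingEdges is hypothetical.

#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"

namespace tensorflow {

// Sketch: remove every edge out of `node` while mutating the graph.
// Iterating the snapshot from SuccessorsCopy() stays valid; iterating the
// raw Successors() span here could read storage the mutations invalidate.
void DropOutgoingEdges(GraphCycles* graph, int32 node) {
  for (int32 successor : graph->SuccessorsCopy(node)) {
    graph->RemoveEdge(node, successor);
  }
}

}  // namespace tensorflow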
107 changes: 84 additions & 23 deletions tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -233,6 +233,29 @@ class MarkForCompilationPassImpl {
// Returns true if any new clusters were created.
StatusOr<bool> RunEdgeContractionLoopInPostOrderOnce();

// Runs through all the nodes in `cycles_graph_` and tries to contract high
// priority edges for clusters. Returns true if any new clusters were created.
//
// There are potentially many maximal clustering results, but they will not
// all be equally performant. Some clustering decisions are likely to improve
// performance much more than others, and we cannot order contractions on this
// cost function, nor can we look at global information while deciding on
// individual edges to contract. Instead, we make decisions on these important
// edges first and then on all other edges, giving the most important edges the
// highest chance of being contracted.
//
// An example of where this might occur is with a digraph:
// {A -> B, B -> C, A -> X, X -> C} where B is a Size operation and X is
// not-compilable. In this case, the valid clusterings are {A,B} or {B,C}. B
// should be clustered with A because it will prevent a potentially large
// tensor from A being computed and copied.
//
// Running this pass first ensures that the preferred contraction happens; this
// cannot be enforced in a single pass over the graph with the current
// algorithm, since we need to prevent B->C from being clustered in
// anticipation of the later, more beneficial A->B cluster.
StatusOr<bool> ContractPreferredEdges();

// Contracts as many edges as possible to create XLA clusters. After this
// finishes the clustering decisions made are implicitly stored in
// `clusters_`.
@@ -314,6 +337,13 @@ class MarkForCompilationPassImpl {
//
// Returns nullptr if `node_id` is not a compilation candidate.
Cluster* GetClusterForCyclesGraphNode(int node_id) {
// We have to check `graph_->FindNodeId(node_id) == nullptr` because we add
// all nodes in [0, graph_->num_node_ids()) to the cycle detection graph but
// the TF graph may be missing some node ids.
if (node_id >= graph_->num_node_ids() ||
graph_->FindNodeId(node_id) == nullptr) {
return nullptr;
}
Cluster* cluster = cluster_for_node_[node_id].Get();
if (cluster) {
DCHECK_EQ(cluster->cycles_graph_node_id(), node_id);
@@ -581,6 +611,50 @@ Status MarkForCompilationPassImpl::Initialize() {
return BuildInitialClusterSet();
}

StatusOr<bool> MarkForCompilationPassImpl::ContractPreferredEdges() {
bool changed = false;
for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
if (!cluster_from) {
continue;
}

// Make a copy of the set of successors because we may modify the graph in
// TryToContractEdge.
std::vector<int32> successors_copy =
cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

for (int to : successors_copy) {
iteration_count_++;

Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
if (!cluster_to) {
continue;
}

if (cluster_to->cluster_size() == 1) {
Node* n = graph_->FindNodeId(cluster_to->GetIdOfOnlyNode());

// It is desirable to cluster shape-consuming operations with their operands
// because they return a small set of scalar values after consuming a large
// amount of data. For example, given a graph X -> Y -> Size -> Z, where the
// possible clusterings are [{X, Y, Size}, {Z}] or [{X, Y}, {Size, Z}], the
// better choice is to cluster Size with Y: the output of Size is a small
// tensor while Y is a potentially large tensor that must be computed and
// possibly transposed/copied before the second cluster executes.
if (IsShapeConsumerOp(*n)) {
TF_ASSIGN_OR_RETURN(bool contracted_edge,
TryToContractEdge(cluster_from, cluster_to));
changed |= contracted_edge;
}
}
}
}

return changed;
}

StatusOr<bool>
MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
bool changed = false;
@@ -596,15 +670,8 @@ MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
// digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible
// to contract Y->Z if Y->Z was not contractible originally.
for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
// We have to check `graph_->FindNodeId(node) == nullptr` because we add all
// nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
// TF graph may be missing some node ids.
if (node >= graph_->num_node_ids() || graph_->FindNodeId(node) == nullptr) {
continue;
}

Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
if (cluster_from == nullptr) {
if (!cluster_from) {
continue;
}

@@ -623,7 +690,13 @@ Status MarkForCompilationPassImpl::RunEdgeContractionLoop() {
// TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
// example, from the Grappler fusion pass).

TF_ASSIGN_OR_RETURN(bool changed, RunEdgeContractionLoopInPostOrderOnce());
// Run the contraction loop twice: first targeting only very beneficial edges,
// then without restrictions. This helps minimize the data output from clusters
// (and possible transpose operations before outputs) that might occur if a
// ShapeConsumingOp sits on the boundary of two clusters due to cycle
// considerations.
TF_ASSIGN_OR_RETURN(bool changed, ContractPreferredEdges());

TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce());

// Check that RunEdgeContractionLoopInPostOrderOnce is idempotent. Once the
// linear time post-order scheme has been battle tested we can move this to
@@ -711,10 +784,6 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency(
// where a cluster is producing data for multiple devices.
for (const auto& in_id :
cycles_graph_.Predecessors(cluster_to.cycles_graph_node_id())) {
if (in_id >= graph_->num_node_ids()) {
continue;
}

const Cluster* cluster_in = GetClusterForCyclesGraphNode(in_id);
if (cluster_in) {
TF_ASSIGN_OR_RETURN(bool devices_compatible,
@@ -1070,19 +1139,11 @@ StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgesFrom(

// Make a copy of the set of successors because we may modify the graph in
// TryToContractEdge.
std::vector<int32> successors_copy = [&] {
absl::Span<const int32> successors =
cycles_graph_.Successors(cluster_from->cycles_graph_node_id());
return std::vector<int32>(successors.begin(), successors.end());
}();
std::vector<int32> successors_copy =
cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

for (int to : successors_copy) {
iteration_count_++;
if (to >= graph_->num_node_ids()) {
// Node is a fictitious node that is present only in the cycle detection
// graph. No clustering is possible.
continue;
}

Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
if (!cluster_to) {
54 changes: 54 additions & 0 deletions tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -1548,5 +1548,59 @@ TEST(XlaCompilationTest, DontClusterNodesWithForwardFromAttr) {
EXPECT_EQ(clusters["test/z"], "");
}

// Note: this test relies on implementation details beyond the specific
// heuristic we care about, so other changes might be at fault if it breaks.
// What we care about is that if a ShapeConsumingOp can be connected with a
// producer or a consumer but cannot be clustered with both, it should be
// clustered with the producer.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducer) {
Scope root = Scope::NewRootScope().ExitOnError();
Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
Output y = ops::Size(root.WithOpName("test/y"), x);
Output z = ops::Add(root.WithOpName("test/z"), y, y);

std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
TF_ASSERT_OK(root.ToGraph(graph.get()));

// Ensure that the "Size" op can only be clustered with either the producer or
// consumer by putting them on different devices.
FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kGPU0);
FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kCPU0);
FindNodeByName(graph.get(), "test/z")->set_assigned_device_name(kGPU1);

TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

std::unordered_map<string, string> clusters = GetClusters(*graph);

EXPECT_NE(clusters["test/y"], "");
EXPECT_EQ(clusters["test/x"], clusters["test/y"]);
EXPECT_NE(clusters["test/z"], clusters["test/y"]);
}

// Test that ShapeConsuming ops are still fully clustered whenever possible.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducerAndConsumer) {
Scope root = Scope::NewRootScope().ExitOnError();
Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
Output y = ops::Size(root.WithOpName("test/y"), x);
Output z = ops::Add(root.WithOpName("test/z"), y, y);

std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
TF_ASSERT_OK(root.ToGraph(graph.get()));

TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

std::unordered_map<string, string> clusters = GetClusters(*graph);

EXPECT_NE(clusters["test/y"], "");
EXPECT_EQ(clusters["test/y"], clusters["test/x"]);
EXPECT_EQ(clusters["test/y"], clusters["test/z"]);
}

} // namespace
} // namespace tensorflow
15 changes: 9 additions & 6 deletions tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -51,6 +51,15 @@ Status FindNodesToDecluster(const Graph& graph,
continue;
}

// Assume the benefit of not outputting a larger tensor outweighs the
// benefit of this check.
// TODO(tpopp): Only apply this if the value being consumed is not output
// from the cluster to another consumer.
// TODO(tpopp): See if XlaRun can be modified to avoid this issue
// completely.
if (IsShapeConsumerOp(*n)) {
continue;
}
// We assume the only XLA-auto-clusterable operations with side effects are
// resource variable updates. We can't execute these twice.
if (HasResourceInputOrOutput(*n)) {
@@ -344,12 +353,6 @@ Status PartiallyDeclusterGraph(Graph* graph,
} // namespace reduce_recompilation

namespace decluster_root_shape_consumers {
// Returns true if `node` is an operator that consumes only the shape of its
// input, not the data itself.
bool IsShapeConsumerOp(const Node& node) {
return node.type_string() == "Shape" || node.type_string() == "Rank" ||
node.type_string() == "Size";
}

Status PartiallyDeclusterGraph(Graph* graph) {
std::vector<Node*> reverse_post_order;