
Commit 4bbb6ad

nickgg authored and facebook-github-bot committed on Sep 21, 2020
[NNC] fix SyncThreads insertion and reenable CudaSharedMem test (pytorch#44909)
Summary: A previous fix for masking Cuda dimensions (pytorch#44733) changed how thread synchronization barriers are inserted in the Cuda CodeGen, causing the CudaSharedMemReduce_1 test to be flaky and ultimately disabled.

The issue is working out where these barriers must be inserted. Solving this optimally is very hard, and I think not possible without dependency analysis we don't have, so I've changed our logic to be quite pessimistic: we insert barriers before and after any blocks that have thread dimensions masked, even between blocks that have no data dependencies. This should be correct, but it is an area where we could improve performance. To address this somewhat, I've added a simplifier pass that removes obviously unnecessary syncThreads. To avoid this test going flaky again, I've added a check against the generated code to ensure there is a syncThread in the right place.

Also fixed a couple of non-functional clarity issues in the generated code: added the missing newline after Stores in the CudaPrinter, and prevented the PrioritizeLoad mutator from pulling out loads contained within simple Let statements (such as those produced by the Registerizer).

Pull Request resolved: pytorch#44909
Reviewed By: agolynski
Differential Revision: D23800565
Pulled By: nickgg
fbshipit-source-id: bddef1f40d8d461da965685f01d00b468d8a2c2f
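For illustration, a hand-written CUDA sketch of the shape the rewriter now produces for the shared-memory reduction test; this is not literal CodeGen output, and the input indexing is an assumption. The segment masked down to one thread is bracketed by barriers, and the simplifier then fuses any redundant adjacent ones:

// Illustrative only: hand-written CUDA mirroring the FileCheck pattern in
// test_cuda.cpp below; the a[...] indexing is an assumption, not real output.
__global__ void sharedMemReduceSketch(const float* a, float* b) {
  float c_ = 0.f; // per-thread partial sum
  for (int m = 0; m < 128; m++) {
    c_ = c_ + a[128 * threadIdx.x + m];
  }
  __syncthreads(); // barrier inserted before the masked segment
  if (threadIdx.x < 1) { // segment masked down to a single thread
    b[blockIdx.x] = 0.f;
  }
  __syncthreads(); // barrier inserted after; redundant neighbors get fused
  atomicAdd(&b[blockIdx.x], c_); // unmasked segment: no barrier added
}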
Parent: e2f49c8

File tree: 8 files changed, +187 −31 lines
test/cpp/tensorexpr/test_cuda.cpp (+19)

@@ -762,6 +762,25 @@ void testCudaSharedMemReduce_1() {

   // TODO: check the generated code for correctness.
   CudaCodeGen cuda_cg(loop_k1, a, b);
+
+  std::ostringstream oss;
+  oss << *cuda_cg.stmt();
+
+  // Check the c write is not masked, but the d write is.
+  const std::string& verification_pattern =
+      R"IR(
+# CHECK: c_ = 0
+# CHECK: for (int m = 0; m < 128
+# CHECK: c_ = c_ +
+# CHECK: __syncthreads();
+# CHECK: if (threadIdx.x<1
+# CHECK: b[blockIdx.x] =
+# CHECK: __syncthreads();
+# CHECK: atomicAdd(&b[blockIdx.x], c_)
+)IR";
+
+  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
+
   PaddedBuffer<float> a_v(1, M, N, "a_v");
   PaddedBuffer<float> b_v(1, "b_v");
   PaddedBuffer<float> b_ref(1, "b_ref");
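For context on the verification mechanism: torch::jit::testing::FileCheck matches each "# CHECK:" directive, in order, against the dumped IR text, each one after the previous match. A minimal self-contained sketch, with an invented input string:

#include <torch/csrc/jit/testing/file_check.h>
#include <string>

// Minimal FileCheck illustration: every "# CHECK:" must match in order.
// The checked text here is invented for the example.
void fileCheckExample() {
  const std::string text = "c_ = 0;\n__syncthreads();\natomicAdd(&b[0], c_);\n";
  torch::jit::testing::FileCheck().run(R"IR(
# CHECK: c_ = 0
# CHECK: __syncthreads();
# CHECK: atomicAdd
)IR", text);
}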

test/cpp/tensorexpr/test_simplify.cpp (+94 −5)

@@ -3687,11 +3687,9 @@ void testSimplifyFuseConditions() {
         Store::make(a, {1}, j, mask),
         nullptr),
   });
-
   Stmt* simplified = IRSimplifier::simplify(body);
   IS_NODE_WITH_NAME(Block, simplified, block);
   ASSERT_EQ(block->nstmts(), 3);
-
   auto it = block->begin();
   it++;
   IS_NODE_WITH_NAME(Cond, *it, cond);
@@ -3720,7 +3718,6 @@ void testSimplifyFuseConditions() {
         Store::make(a, {1}, j, mask),
         nullptr),
   });
-
   Stmt* simplified = IRSimplifier::simplify(body);
   IS_NODE_WITH_NAME(Block, simplified, block);
   ASSERT_EQ(block->nstmts(), 1);
@@ -3751,7 +3748,6 @@ void testSimplifyFuseConditions() {
         Store::make(a, {1}, j, mask),
         nullptr),
   });
-
   Stmt* simplified = IRSimplifier::simplify(body);
   IS_NODE_WITH_NAME(Block, simplified, block);
   ASSERT_EQ(block->nstmts(), 3);
@@ -3786,7 +3782,6 @@ void testSimplifyFuseConditions() {
             CompareSelectOperation::kLT),
         Store::make(a, {1}, i, mask),
         nullptr)});
-
   Stmt* simplified = IRSimplifier::simplify(body);
   IS_NODE_WITH_NAME(Block, simplified, block);
   ASSERT_EQ(block->nstmts(), 1);
@@ -3861,5 +3856,99 @@ void testSimplifyFuseConditions() {
   }
 }

+void testSimplifySyncThreads() {
+  KernelScope kernel_scope;
+  Buffer a(BufHandle("A", {4}, kInt));
+  auto mask = IntImm::make(1);
+  VarHandle i("i", kInt);
+
+  {
+    // Merge two inner SyncThreads.
+    auto body = Block::make({Store::make(a, {0}, 1, 1),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             Store::make(a, {1}, 0, 1)});
+    Stmt* simplified = IRSimplifier::simplify(body);
+    IS_NODE_WITH_NAME(Block, simplified, block);
+    ASSERT_EQ(block->nstmts(), 3);
+    auto it = block->begin();
+    IS_NODE(Store, *it++);
+    IS_NODE(SyncThreads, *it++);
+    IS_NODE(Store, *it++);
+  }
+
+  {
+    // Eliminate outer SyncThreads.
+    auto body = Block::make(
+        {new SyncThreads(), Store::make(a, {1}, 0, 1), new SyncThreads()});
+
+    Stmt* simplified = IRSimplifier::simplify(body);
+    IS_NODE_WITH_NAME(Block, simplified, block);
+    ASSERT_EQ(block->nstmts(), 1);
+    auto it = block->begin();
+    IS_NODE(Store, *it);
+  }
+
+  {
+    // Merge many inner SyncThreads.
+    auto body = Block::make({Store::make(a, {0}, 1, 1),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             Store::make(a, {1}, 0, 1)});
+
+    Stmt* simplified = IRSimplifier::simplify(body);
+    IS_NODE_WITH_NAME(Block, simplified, block);
+    ASSERT_EQ(block->nstmts(), 3);
+    auto it = block->begin();
+    IS_NODE(Store, *it++);
+    IS_NODE(SyncThreads, *it++);
+    IS_NODE(Store, *it++);
+  }
+
+  {
+    // Merge multiple outer SyncThreads.
+    auto body = Block::make({new SyncThreads(),
+                             new SyncThreads(),
+                             Store::make(a, {1}, 0, 1),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             new SyncThreads()});
+
+    Stmt* simplified = IRSimplifier::simplify(body);
+    IS_NODE_WITH_NAME(Block, simplified, block);
+    ASSERT_EQ(block->nstmts(), 1);
+    auto it = block->begin();
+    IS_NODE(Store, *it);
+  }
+
+  {
+    // Merge multiple sections.
+    auto body = Block::make({Store::make(a, {0}, 1, 1),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             Store::make(a, {1}, 0, 1),
+                             Store::make(a, {2}, 0, 1),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             new SyncThreads(),
+                             Store::make(a, {3}, 0, 1)});
+
+    Stmt* simplified = IRSimplifier::simplify(body);
+    IS_NODE_WITH_NAME(Block, simplified, block);
+    ASSERT_EQ(block->nstmts(), 6);
+    auto it = block->begin();
+    IS_NODE(Store, *it++);
+    IS_NODE(SyncThreads, *it++);
+    IS_NODE(Store, *it++);
+    IS_NODE(Store, *it++);
+    IS_NODE(SyncThreads, *it++);
+    IS_NODE(Store, *it++);
+  }
+}
+
 } // namespace jit
 } // namespace torch
test/cpp/tensorexpr/tests.h (+2 −1)

@@ -215,6 +215,7 @@ namespace jit {
   _(DontSimplifyRand) \
   _(SimplifyReorderForCond) \
   _(SimplifyFuseConditions) \
+  _(SimplifySyncThreads) \
   _(RegisterizerSimple) \
   _(RegisterizerLoop) \
   _(RegisterizerLoopFixedLoad) \
@@ -434,6 +435,7 @@ namespace jit {
   _(CudaOneBlockMultiThreadGlobalReduce1) \
   _(CudaNoThreadIdxWrite_1) \
   _(CudaLocalMemReduce_1) \
+  _(CudaSharedMemReduce_1) \
   _(CudaTestRand01) \
   _(CudaSigmoid) \
   _(CudaHalfCast) \
@@ -449,7 +451,6 @@ namespace jit {
   _(CudaMaskInnerLoopOneBlock) \
   _(CudaMaskMultiDimMultiAxis) \
   _(CudaMaskMultiDimMultiLevel)
-// _(CudaSharedMemReduce_1)

 #define DECLARE_TENSOREXPR_TEST(name) void test##name();
 TH_FORALL_TENSOREXPR_TESTS(DECLARE_TENSOREXPR_TEST)

torch/csrc/jit/tensorexpr/cuda_codegen.cpp (+21 −19)

@@ -436,6 +436,7 @@ void CudaPrinter::visit(const Store* v) {
     os() << *v->base_handle() << "[" << *v->flat_index() << "] = ";
   }
   os() << *v->value() << ";";
+  os() << std::endl;
 }

 void CudaPrinter::visit(const AtomicAdd* v) {
@@ -505,6 +506,9 @@ class PrioritizeLoad : public IRMutator {
     if (nested_if_then_else_ > 0) {
       return IRMutator::mutate(v);
     }
+    if (nested_let_) {
+      return IRMutator::mutate(v);
+    }
     if (thread_local_bufs_.count(v->base_handle()) > 0) {
       return IRMutator::mutate(v);
     }
@@ -566,6 +570,13 @@ class PrioritizeLoad : public IRMutator {
     return s;
   }

+  Stmt* mutate(const Let* v) override {
+    nested_let_ = true;
+    Stmt* s = IRMutator::mutate(v);
+    nested_let_ = false;
+    return s;
+  }
+
   Stmt* mutate(const Block* v) override {
     bool any_change = false;

@@ -631,8 +642,9 @@ class PrioritizeLoad : public IRMutator {
   // v = false_v;
   // }
   // int v2 = v + 2;
-  int nested_if_then_else_ = 0;
+  int nested_if_then_else_{0};
   const Store* nested_store_{nullptr};
+  bool nested_let_{false};
   std::unordered_set<const Var*> thread_local_bufs_;
 };

@@ -703,13 +715,6 @@ Stmt* GPUMetaVarRewriter::mutate(const For* v) {
         IRSimplifier::simplify(new Max(old_reach, v->stop(), true));
   }

-  // If a thread dimension has changed, insert a syncThreads in the enclosing
-  // Block.
-  if (last_thread_dim_ && !exprEquals(last_thread_dim_, v->stop())) {
-    need_sync_ = true;
-  }
-  last_thread_dim_ = v->stop();
-
   const Var* metaVar = gpu_thread_vars_[gpu_thread_index];
   body = Substitute(Stmt::clone(body), {{v->var(), metaVar}});
 }
@@ -750,16 +755,6 @@ Stmt* GPUMetaVarRewriter::mutate(const Block* v) {
       stmt_new = Stmt::clone(stmt_new);
     }

-    if (need_sync_) {
-      // sync is special, we never want to mask it and it is never part of
-      // another segment.
-      pushAndReset(false);
-      current.stmts().push_back(new SyncThreads());
-      pushAndReset(true);
-
-      need_sync_ = false;
-    }
-
     // Likewise, Allocate and Free should never be masked.
     if (dynamic_cast<Allocate*>(stmt) || dynamic_cast<Free*>(stmt)) {
       pushAndReset(false);
@@ -796,8 +791,8 @@ Stmt* GPUMetaVarRewriter::mutate(const Block* v) {
   }

   std::vector<Stmt*> stmts;
-  int rsqi = 0;
   for (auto& segment : innerSegments) {
+    bool need_sync = false;
     // We never mask loops, they'll mask their contents.
     if (!segment.mask()) {
       TORCH_INTERNAL_ASSERT(segment.stmts().size() == 1);
@@ -812,6 +807,7 @@ Stmt* GPUMetaVarRewriter::mutate(const Block* v) {
     auto& thread_extents = cuda_analysis_->gpu_thread_extents();
     for (size_t i = 0; i < gpu_thread_vars_.size(); ++i) {
       if (!exprEquals(current_thread_reach_[i], thread_extents[i])) {
+        need_sync = true;
         // Mask it against the current dimensions.
         inner = new Cond(
             new CompareSelect(
@@ -836,7 +832,13 @@ Stmt* GPUMetaVarRewriter::mutate(const Block* v) {
       }
     }

+    if (need_sync) {
+      stmts.push_back(new SyncThreads());
+    }
     stmts.push_back(inner);
+    if (need_sync) {
+      stmts.push_back(new SyncThreads());
+    }
   }

   return new Block(stmts);
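The net effect of the rewriter change, restated as a standalone sketch under stand-in types; Segment here is a simplified string-based stand-in for the real segment class, not the actual API:

#include <string>
#include <vector>

// Stand-in for the rewriter's segment type; the real code works on NNC IR
// nodes (Stmt*, Cond, SyncThreads), not strings.
struct Segment {
  std::string stmt;
  bool masked; // true if any thread dimension was masked against its extent
};

// Pessimistic placement: bracket every masked segment with barriers, even
// without a proven data dependency; the simplifier prunes the extras later.
std::vector<std::string> emitWithSyncs(const std::vector<Segment>& segments) {
  std::vector<std::string> out;
  for (const auto& seg : segments) {
    if (seg.masked) out.push_back("__syncthreads();");
    out.push_back(seg.stmt);
    if (seg.masked) out.push_back("__syncthreads();");
  }
  return out;
}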

torch/csrc/jit/tensorexpr/cuda_codegen.h (−2)

@@ -138,8 +138,6 @@ class GPUMetaVarRewriter : public IRMutator {
   std::vector<const Expr*> current_block_reach_;
   std::vector<const Expr*> current_thread_reach_;

-  bool need_sync_ = false;
-  const Expr* last_thread_dim_ = nullptr;
   const CudaAnalysis* cuda_analysis_;
 };

torch/csrc/jit/tensorexpr/ir_simplifier.cpp (+48 −2)

@@ -1930,7 +1930,6 @@ Block* TermExpander::fuseConditions(Block* v) {
     // erase, which shortens the list.
     stmts.pop_back();
     stmts.push_back(prev_cond);
-
     did_anything = true;
   }

@@ -1948,6 +1947,51 @@ Block* TermExpander::fuseConditions(Block* v) {
   return new Block(stmts);
 }

+Stmt* TermExpander::fuseSyncThreads(Block* block) {
+  // only really first if highest level Block.
+  bool first = block->get_parent() == nullptr;
+  SyncThreads* last = nullptr;
+  std::vector<Stmt*> stmts;
+  bool did_anything = false;
+
+  for (auto* s : *block) {
+    SyncThreads* sync = dynamic_cast<SyncThreads*>(s);
+    if (!sync) {
+      first = false;
+      last = nullptr;
+      stmts.push_back(s);
+      continue;
+    }
+
+    if (first || last) {
+      did_anything = true;
+      continue;
+    }
+
+    last = sync;
+    first = false;
+    stmts.push_back(s);
+  }
+
+  if (last) {
+    stmts.pop_back();
+    did_anything = true;
+  }
+
+  if (!did_anything) {
+    return block;
+  }
+
+  // clean up parents.
+  for (auto* s : stmts) {
+    if (s->get_parent() == block) {
+      block->remove_stmt(s);
+    }
+  }
+
+  return new Block({stmts});
+}
+
 Stmt* TermExpander::mutate(const Block* v) {
   Stmt* new_stmt = IRSimplifierBase::mutate(v);
   Block* new_block = dynamic_cast<Block*>(new_stmt);
@@ -1956,7 +2000,9 @@ Stmt* TermExpander::mutate(const Block* v) {
   }

   // fuseConditions will return the original block if it cannot fuse.
-  return fuseConditions(new_block);
+  new_block = fuseConditions(new_block);
+  /// fuseSyncThreads too.
+  return fuseSyncThreads(new_block);
 }

 bool exprEquals(const Expr* A, const Expr* B) {
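The fusion rules in fuseSyncThreads, restated over plain tokens as a sketch (not the real Stmt*/Block API): drop barriers at the head of the top-level block, collapse runs of consecutive barriers to one, and drop barriers at the tail.

#include <string>
#include <vector>

// Token-level sketch of fuseSyncThreads: "sync" models a SyncThreads node,
// any other string models a real statement.
std::vector<std::string> fuseSyncTokens(
    const std::vector<std::string>& stmts,
    bool top_level) {
  std::vector<std::string> out;
  bool first = top_level; // leading syncs are dead only at the top level
  bool last_was_sync = false;
  for (const auto& s : stmts) {
    if (s != "sync") {
      first = false;
      last_was_sync = false;
      out.push_back(s);
      continue;
    }
    if (first || last_was_sync) {
      continue; // leading or repeated sync: drop it
    }
    last_was_sync = true;
    out.push_back(s);
  }
  if (last_was_sync) {
    out.pop_back(); // trailing sync: drop it
  }
  return out;
}
// e.g. {"store", "sync", "sync", "store", "sync"} -> {"store", "sync", "store"}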

torch/csrc/jit/tensorexpr/ir_simplifier.h (+1)

@@ -570,6 +570,7 @@ class TORCH_API TermExpander : public IRSimplifierBase {

   // Override to enable condition fusing.
   Block* fuseConditions(Block* v);
+  Stmt* fuseSyncThreads(Block* block);
   Stmt* mutate(const Block* v) override;
 };

torch/csrc/jit/tensorexpr/stmt.h (+2 −2)

@@ -676,7 +676,7 @@ class TORCH_API For : public StmtNode<For> {
 // This node could only shows up as an internal with GPU backends.
 // TODO: move to this an internal IR.
 // TODO: make IR nodes extensible.
-class AtomicAdd : public StmtNode<AtomicAdd> {
+class TORCH_API AtomicAdd : public StmtNode<AtomicAdd> {
  public:
   AtomicAdd(
       const Buf* buf,
@@ -711,7 +711,7 @@ class AtomicAdd : public StmtNode<AtomicAdd> {
   const Expr* value_;
 };

-class SyncThreads : public StmtNode<SyncThreads> {
+class TORCH_API SyncThreads : public StmtNode<SyncThreads> {
  public:
   SyncThreads() {}
 };