koparasy · Jun 5, 2021
diff --git a/‎polly/include/polly/MatmulOptimizer.h
+74 b/‎polly/include/polly/MatmulOptimizer.h
+74
diff --git a/‎polly/include/polly/ScheduleOptimizer.h
-20 b/‎polly/include/polly/ScheduleOptimizer.h
-20
diff --git a/‎polly/include/polly/ScheduleTreeTransform.h
+60 b/‎polly/include/polly/ScheduleTreeTransform.h
+60
diff --git a/‎polly/lib/CMakeLists.txt
+1 b/‎polly/lib/CMakeLists.txt
+1
@@ -0,0 +1,74 @@
+//===- MatmulOptimizer.h -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_MATMULOPTIMIZER_H
+#define POLLY_MATMULOPTIMIZER_H
+
+#include "isl/isl-noexceptions.h"
+
+namespace llvm {
+class TargetTransformInfo;
+}
+
+namespace polly {
+struct Dependences;
+
+/// Apply the BLIS matmul optimization pattern if possible.
+///
+/// Make the loops containing the matrix multiplication be the innermost
+/// loops and apply the BLIS matmul optimization pattern. BLIS implements
+/// gemm as three nested loops around a macro-kernel, plus two packing
+/// routines. The macro-kernel is implemented in terms of two additional
+/// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
+/// (i.e., outer product) update.
+///
+/// For a detailed description please see [1].
+///
+/// The order of the loops defines the data reused in the BLIS implementation
+/// of gemm ([1]). In particular, elements of the matrix B, the second
+/// operand of matrix multiplication, are reused between iterations of the
+/// innermost loop. To keep the reused data in cache, only elements of matrix
+/// A, the first operand of matrix multiplication, should be evicted during
+/// an iteration of the innermost loop. To provide such a cache replacement
+/// policy, elements of the matrix A can, in particular, be loaded first and,
+/// consequently, be least-recently-used.
+///
+/// In our case matrices are stored in row-major order instead of
+/// column-major order used in the BLIS implementation ([1]). This affects
+/// only the form of the BLIS micro kernel and the computation of its
+/// parameters. In particular, reused elements of the matrix B are
+/// successively multiplied by specific elements of the matrix A.
+///
+/// Refs.:
+/// [1] - Analytical Modeling is Enough for High Performance BLIS
+/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+/// Technical Report, 2014
+/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMicroKernelParams
+/// @see getMacroKernelParams
+///
+/// TODO: Implement the packing transformation.
+///
+/// @param Node The node that contains a band to be optimized. The node
+///             is required to successfully pass
+///             ScheduleTreeOptimizer::isMatrMultPattern.
+/// @param TTI  Target Transform Info.
+/// @param D    The dependencies.
+///
+/// @returns    The transformed schedule or nullptr if the optimization
+///             cannot be applied.
+isl::schedule_node
+tryOptimizeMatMulPattern(isl::schedule_node Node,
+                         const llvm::TargetTransformInfo *TTI,
+                         const Dependences *D);
+
+} // namespace polly
+#endif // POLLY_MATMULOPTIMIZER_H
@@ -37,26 +37,6 @@ struct IslScheduleOptimizerPrinterPass
 private:
   llvm::raw_ostream &OS;
 };
-
-/// Build the desired set of partial tile prefixes.
-///
-/// We build a set of partial tile prefixes, which are prefixes of the vector
-/// loop that have exactly VectorWidth iterations.
-///
-/// 1. Drop all constraints involving the dimension that represents the
-///    vector loop.
-/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
-///    iterations.
-/// 3. Subtract loop domain from it, project out the vector loop dimension and
-///    get a set that contains prefixes, which do not have exactly VectorWidth
-///    iterations.
-/// 4. Project out the vector loop dimension of the set that was build on the
-///    first step and subtract the set built on the previous step to get the
-///    desired set of prefixes.
-///
-/// @param ScheduleRange A range of a map, which describes a prefix schedule
-///                      relation.
-isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
 } // namespace polly
 
 namespace llvm {
 
@@ -13,6 +13,7 @@
 #ifndef POLLY_SCHEDULETREETRANSFORM_H
 #define POLLY_SCHEDULETREETRANSFORM_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "isl/isl-noexceptions.h"
 #include <cassert>
@@ -164,6 +165,65 @@ isl::schedule applyFullUnroll(isl::schedule_node BandToUnroll);
 /// Replace the AST band @p BandToUnroll by a partially unrolled equivalent.
 isl::schedule applyPartialUnroll(isl::schedule_node BandToUnroll, int Factor);
 
+/// Build the desired set of partial tile prefixes.
+///
+/// We build a set of partial tile prefixes, which are prefixes of the vector
+/// loop that have exactly VectorWidth iterations.
+///
+/// 1. Drop all constraints involving the dimension that represents the
+///    vector loop.
+/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
+///    iterations.
+/// 3. Subtract loop domain from it, project out the vector loop dimension and
+///    get a set that contains prefixes, which do not have exactly VectorWidth
+///    iterations.
+/// 4. Project out the vector loop dimension of the set that was built on the
+///    first step and subtract the set built on the previous step to get the
+///    desired set of prefixes.
+///
+/// @param ScheduleRange A range of a map, which describes a prefix schedule
+///                      relation.
+isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
+
+/// Create an isl::union_set, which describes the isolate option based on
+/// IsolateDomain.
+///
+/// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
+///                      belong to the current band node.
+/// @param OutDimsNum    A number of dimensions that should belong to
+///                      the current band node.
+isl::union_set getIsolateOptions(isl::set IsolateDomain, isl_size OutDimsNum);
+
+/// Create an isl::union_set, which describes the specified option for the
+/// dimension of the current node.
+///
+/// @param Ctx    An isl::ctx, which is used to create the isl::union_set.
+/// @param Option The name of the option.
+isl::union_set getDimOptions(isl::ctx Ctx, const char *Option);
+
+/// Tile a schedule node.
+///
+/// @param Node            The node to tile.
+/// @param Identifier      A name that identifies this kind of tiling and
+///                        that is used to mark the tiled loops in the
+///                        generated AST.
+/// @param TileSizes       A vector of tile sizes that should be used for
+///                        tiling.
+/// @param DefaultTileSize A default tile size that is used for dimensions
+///                        that are not covered by the TileSizes vector.
+isl::schedule_node tileNode(isl::schedule_node Node, const char *Identifier,
+                            llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
+
+/// Tile a schedule node and unroll point loops.
+///
+/// @param Node            The node to register tile.
+/// @param TileSizes       A vector of tile sizes that should be used for
+///                        tiling.
+/// @param DefaultTileSize Tile size for dimensions not covered by TileSizes.
+isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
+                                       llvm::ArrayRef<int> TileSizes,
+                                       int DefaultTileSize);
+
 } // namespace polly
 
 #endif // POLLY_SCHEDULETREETRANSFORM_H
@@ -99,6 +99,7 @@ add_llvm_pass_plugin(Polly
   Transform/RewriteByReferenceParameters.cpp
   Transform/ScopInliner.cpp
   Transform/ManualOptimizer.cpp
+  Transform/MatmulOptimizer.cpp
   ${POLLY_HEADER_FILES}
 
   LINK_COMPONENTS