|
| 1 | +//===- MatmulOptimizer.h -------------------------------------------------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | + |
| 9 | +#ifndef POLLY_MATMULOPTIMIZER_H |
| 10 | +#define POLLY_MATMULOPTIMIZER_H |
| 11 | + |
| 12 | +#include "isl/isl-noexceptions.h" |
| 13 | + |
| 14 | +namespace llvm { |
| 15 | +class TargetTransformInfo; |
| 16 | +} |
| 17 | + |
| 18 | +namespace polly { |
| 19 | +struct Dependences; |
| 20 | + |
| 21 | +/// Apply the BLIS matmul optimization pattern if possible. |
| 22 | +/// |
| 23 | +/// Make the loops containing the matrix multiplication be the innermost |
| 24 | +/// loops and apply the BLIS matmul optimization pattern. BLIS implements |
| 25 | +/// gemm as three nested loops around a macro-kernel, plus two packing |
| 26 | +/// routines. The macro-kernel is implemented in terms of two additional |
| 27 | +/// loops around a micro-kernel. The micro-kernel is a loop around a rank-1 |
| 28 | +/// (i.e., outer product) update. |
| 29 | +/// |
| 30 | +/// For a detailed description please see [1]. |
| 31 | +/// |
| 32 | +/// The order of the loops defines the data reused in the BLIS implementation |
| 33 | +/// of gemm ([1]). In particular, elements of the matrix B, the second |
| 34 | +/// operand of matrix multiplication, are reused between iterations of the |
| 35 | +/// innermost loop. To keep the reused data in cache, only elements of matrix |
| 36 | +/// A, the first operand of matrix multiplication, should be evicted during |
| 37 | +/// an iteration of the innermost loop. To provide such a cache replacement |
| 38 | +/// policy, elements of the matrix A can, in particular, be loaded first and, |
| 39 | +/// consequently, be least-recently-used. |
| 40 | +/// |
| 41 | +/// In our case matrices are stored in row-major order instead of |
| 42 | +/// column-major order used in the BLIS implementation ([1]). It affects only |
| 43 | +/// on the form of the BLIS micro kernel and the computation of its |
| 44 | +/// parameters. In particular, reused elements of the matrix B are |
| 45 | +/// successively multiplied by specific elements of the matrix A. |
| 46 | +/// |
| 47 | +/// Refs.: |
| 48 | +/// [1] - Analytical Modeling is Enough for High Performance BLIS |
| 49 | +/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti |
| 50 | +/// Technical Report, 2014 |
| 51 | +/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf |
| 52 | +/// |
| 53 | +/// @see ScheduleTreeOptimizer::createMicroKernel |
| 54 | +/// @see ScheduleTreeOptimizer::createMacroKernel |
| 55 | +/// @see getMicroKernelParams |
| 56 | +/// @see getMacroKernelParams |
| 57 | +/// |
| 58 | +/// TODO: Implement the packing transformation. |
| 59 | +/// |
| 60 | +/// @param Node The node that contains a band to be optimized. The node |
| 61 | +/// is required to successfully pass |
| 62 | +/// ScheduleTreeOptimizer::isMatrMultPattern. |
| 63 | +/// @param TTI Target Transform Info. |
| 64 | +/// @param D The dependencies. |
| 65 | +/// |
| 66 | +/// @returns The transformed schedule or nullptr if the optimization |
| 67 | +/// cannot be applied. |
| 68 | +isl::schedule_node |
| 69 | +tryOptimizeMatMulPattern(isl::schedule_node Node, |
| 70 | + const llvm::TargetTransformInfo *TTI, |
| 71 | + const Dependences *D); |
| 72 | + |
| 73 | +} // namespace polly |
| 74 | +#endif // POLLY_MATMULOPTIMIZER_H |
0 commit comments