From a97eed928f410d3e4f91e3821a90cb41a5cff89c Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 30 Jan 2025 13:54:53 -0800
Subject: [PATCH 001/282] Bump version to 20.1.0git (#125067)

---
 cmake/Modules/LLVMVersion.cmake          | 2 +-
 libcxx/include/__config                  | 2 +-
 llvm/utils/gn/secondary/llvm/version.gni | 2 +-
 llvm/utils/lit/lit/__init__.py           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 897dd963bd9ab..281d0444255ba 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -4,7 +4,7 @@ if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 20)
 endif()
 if(NOT DEFINED LLVM_VERSION_MINOR)
-  set(LLVM_VERSION_MINOR 0)
+  set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 30b2d147dc563..a866a7e651837 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -28,7 +28,7 @@
 // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM.
 // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is
 // defined to XXYYZZ.
-#  define _LIBCPP_VERSION 200000
+#  define _LIBCPP_VERSION 200100
 
 #  define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y
 #  define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y)
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni
index e0107cdeae76d..a99e5ba892da5 100644
--- a/llvm/utils/gn/secondary/llvm/version.gni
+++ b/llvm/utils/gn/secondary/llvm/version.gni
@@ -1,4 +1,4 @@
 llvm_version_major = 20
-llvm_version_minor = 0
+llvm_version_minor = 1
 llvm_version_patch = 0
 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py
index ea56d192fb394..e1e31718020f2 100644
--- a/llvm/utils/lit/lit/__init__.py
+++ b/llvm/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Daniel Dunbar"
 __email__ = "daniel@minormatter.com"
-__versioninfo__ = (20, 0, 0)
+__versioninfo__ = (20, 1, 0)
 __version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
 
 __all__ = []

From fa12df5e22aa5ae6d3c0a5b261acd15bd49081e8 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 Jan 2025 09:09:14 +0100
Subject: [PATCH 002/282] [SCEV] Check correct value for UB (#124302)

This is a followup to #117152. That patch introduced a check for
UB/poison on BEValue. However, the SCEV we're actually going to use is
Shifted. In some cases, it's possible for Shifted to contain UB, while
BEValue doesn't.

In the test case the values are:

BEValue: (-1 * (zext i8 (-83 + ((-83 /u {1,+,1}<%loop>) *
{-1,+,-1}<%loop>)) to i32))<nuw><nsw>
Shifted: (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) *
{0,+,-1}<%loop>) to i32))<nuw><nsw>)<nuw><nsw>

Fixes https://github.com/llvm/llvm-project/issues/123550.

(cherry picked from commit 07efe2c18a63423943a4f9d9daeada23601f84c8)
---
 llvm/lib/Analysis/ScalarEvolution.cpp         | 26 +++++++++----------
 .../test/Analysis/ScalarEvolution/pr123550.ll |  8 +++---
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 7d7d37b3d228d..2ce40877b523e 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -5917,20 +5917,18 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
     //   PHI(f(0), f({1,+,1})) --> f({0,+,1})
 
     // Do not allow refinement in rewriting of BEValue.
-    if (isGuaranteedNotToCauseUB(BEValue)) {
-      const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this);
-      const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this, false);
-      if (Shifted != getCouldNotCompute() && Start != getCouldNotCompute() &&
-          ::impliesPoison(BEValue, Start)) {
-        const SCEV *StartVal = getSCEV(StartValueV);
-        if (Start == StartVal) {
-          // Okay, for the entire analysis of this edge we assumed the PHI
-          // to be symbolic.  We now need to go back and purge all of the
-          // entries for the scalars that use the symbolic expression.
-          forgetMemoizedResults(SymbolicName);
-          insertValueToMap(PN, Shifted);
-          return Shifted;
-        }
+    const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this);
+    const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this, false);
+    if (Shifted != getCouldNotCompute() && Start != getCouldNotCompute() &&
+        isGuaranteedNotToCauseUB(Shifted) && ::impliesPoison(Shifted, Start)) {
+      const SCEV *StartVal = getSCEV(StartValueV);
+      if (Start == StartVal) {
+        // Okay, for the entire analysis of this edge we assumed the PHI
+        // to be symbolic.  We now need to go back and purge all of the
+        // entries for the scalars that use the symbolic expression.
+        forgetMemoizedResults(SymbolicName);
+        insertValueToMap(PN, Shifted);
+        return Shifted;
       }
     }
   }
diff --git a/llvm/test/Analysis/ScalarEvolution/pr123550.ll b/llvm/test/Analysis/ScalarEvolution/pr123550.ll
index c1f2051248a12..709da00935ef3 100644
--- a/llvm/test/Analysis/ScalarEvolution/pr123550.ll
+++ b/llvm/test/Analysis/ScalarEvolution/pr123550.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -disable-output -passes='print<scalar-evolution>' < %s 2>&1 | FileCheck %s
 
-; FIXME: This is a miscompile.
+; %srem should have exit value 130.
 define i32 @test() {
 ; CHECK-LABEL: 'test'
 ; CHECK-NEXT:  Classifying expressions for: @test
 ; CHECK-NEXT:    %phi = phi i32 [ -173, %bb ], [ %sub, %loop ]
-; CHECK-NEXT:    --> (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))<nuw><nsw>)<nuw><nsw> U: empty-set S: empty-set Exits: -173 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> %phi U: [-173,1) S: [-173,1) Exits: -173 LoopDispositions: { %loop: Variant }
 ; CHECK-NEXT:    %iv2 = phi i32 [ 1, %bb ], [ %iv2.inc, %loop ]
 ; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%loop> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %srem = srem i32 729259140, %phi
-; CHECK-NEXT:    --> (729259140 + (-1 * (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))<nuw><nsw>)<nuw><nsw> * (729259140 /u (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))<nuw><nsw>)<nuw><nsw>)))<nuw><nsw> U: empty-set S: empty-set Exits: 729259140 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> %srem U: [0,1073741824) S: [0,1073741824) Exits: 130 LoopDispositions: { %loop: Variant }
 ; CHECK-NEXT:    %trunc = trunc i32 %iv2 to i8
 ; CHECK-NEXT:    --> {1,+,1}<%loop> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %urem = urem i8 -83, %trunc
@@ -22,7 +22,7 @@ define i32 @test() {
 ; CHECK-NEXT:    %iv2.inc = add i32 %iv2, 1
 ; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%loop> U: [2,3) S: [2,3) Exits: 2 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %srem.lcssa = phi i32 [ %srem, %loop ]
-; CHECK-NEXT:    --> (729259140 + (-1 * (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))<nuw><nsw>)<nuw><nsw> * (729259140 /u (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))<nuw><nsw>)<nuw><nsw>)))<nuw><nsw> U: empty-set S: empty-set --> 729259140 U: [729259140,729259141) S: [729259140,729259141)
+; CHECK-NEXT:    --> %srem U: [0,1073741824) S: [0,1073741824) --> 130 U: [130,131) S: [130,131)
 ; CHECK-NEXT:  Determining loop execution counts for: @test
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is i32 0
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i32 0

From 66a50e246d0f5f0839f2093c1981b3cac44d6df7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 29 Jan 2025 16:51:19 -0800
Subject: [PATCH 003/282] workflows/release-binaries: Stop using ccache
 (#124415)

Using ccache relies on the GitHub Actions Cache, which may be
susceptible to cache poisoning. See
https://adnanthekhan.com/2024/05/06/the-monsters-in-your-build-cache-github-actions-cache-poisoning/

Even though these attacks may be difficult, it's better to err on the
side of caution and ensure that the build environment for our releases
is as isolated as possible. Additionally, ccache was only being used for
the stage1 build, which is a small part of the overall build, so the
speed up from using it was not that large.

(cherry picked from commit b32e55df246e26f3ea8edc65e92e4c19d2658f0c)
---
 .github/workflows/release-binaries.yml | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 2ca4aea8a3b0e..c49939ea48c5f 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -226,14 +226,6 @@ jobs:
       id: setup-stage
       uses: ./workflows-main/.github/workflows/release-binaries-setup-stage
 
-    - name: Setup sccache
-      uses: hendrikmuhs/ccache-action@ca3acd2731eef11f1572ccb126356c2f9298d35e # v1.2.9
-      with:
-        # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174
-        max-size: 2G
-        key: ${{ needs.prepare.outputs.ccache }}-${{ runner.os }}-${{ runner.arch }}-release
-        variant: ${{ needs.prepare.outputs.ccache }}
-
     - name: Configure
       id: build
       shell: bash
@@ -246,9 +238,8 @@ jobs:
             ${{ needs.prepare.outputs.target-cmake-flags }} \
             -C clang/cmake/caches/Release.cmake \
             -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \
-            -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" \
-            -DCMAKE_C_COMPILER_LAUNCHER=$CCACHE_BIN \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=$CCACHE_BIN
+            -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}"
+
     - name: Build
       shell: bash
       run: |

From 0e240b08c6ff4f891bf3741d25aca17057d6992f Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 30 Jan 2025 19:37:54 -0800
Subject: [PATCH 004/282] workflows/premerge: Add macOS testing for release
 branch (#124303)

Also, remove the old pre-merge tests since Linux and Windows are tested on buildkite now.

(cherry picked from commit b89617d02d1c8a2701c1d3290d2ff45cd858ccde)
---
 .github/workflows/clang-tests.yml  | 38 ----------------
 .github/workflows/libclc-tests.yml | 39 -----------------
 .github/workflows/lld-tests.yml    | 38 ----------------
 .github/workflows/lldb-tests.yml   | 39 -----------------
 .github/workflows/llvm-tests.yml   | 10 -----
 .github/workflows/premerge.yaml    | 70 ++++++++++++++++++++++++++++++
 6 files changed, 70 insertions(+), 164 deletions(-)
 delete mode 100644 .github/workflows/clang-tests.yml
 delete mode 100644 .github/workflows/libclc-tests.yml
 delete mode 100644 .github/workflows/lld-tests.yml
 delete mode 100644 .github/workflows/lldb-tests.yml

diff --git a/.github/workflows/clang-tests.yml b/.github/workflows/clang-tests.yml
deleted file mode 100644
index 2569ce19518e3..0000000000000
--- a/.github/workflows/clang-tests.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: Clang Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'clang/**'
-      - '.github/workflows/clang-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!llvm/**'
-  pull_request:
-    branches:
-      - 'release/**'
-    paths:
-      - 'clang/**'
-      - '.github/workflows/clang-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!llvm/**'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  check_clang:
-    if: github.repository_owner == 'llvm'
-    name: Test clang,lldb,libclc
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      build_target: check-clang
-      projects: clang;lldb;libclc
diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml
deleted file mode 100644
index 23192f776a985..0000000000000
--- a/.github/workflows/libclc-tests.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-name: libclc Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'libclc/**'
-      - '.github/workflows/libclc-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!clang/**'
-      - '!llvm/**'
-  pull_request:
-    branches:
-      - 'release/**'
-    paths:
-      - 'libclc/**'
-      - '.github/workflows/libclc-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!clang/**'
-      - '!llvm/**'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  check_libclc:
-    if: github.repository_owner == 'llvm'
-    name: Test libclc
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      projects: clang;libclc
diff --git a/.github/workflows/lld-tests.yml b/.github/workflows/lld-tests.yml
deleted file mode 100644
index 599c0975fa685..0000000000000
--- a/.github/workflows/lld-tests.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: LLD Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'lld/**'
-      - '.github/workflows/lld-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!llvm/**'
-  pull_request:
-    branches:
-      - 'release/**'
-    paths:
-      - 'lld/**'
-      - '.github/workflows/lld-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!llvm/**'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  check_lld:
-    if: github.repository_owner == 'llvm'
-    name: Test lld
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      build_target: check-lld
-      projects: lld
diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml
deleted file mode 100644
index 6bb9721956258..0000000000000
--- a/.github/workflows/lldb-tests.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-name: lldb Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'lldb/**'
-      - '.github/workflows/lldb-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!clang/**'
-      - '!llvm/**'
-  pull_request:
-    branches:
-      - 'release/**'
-    paths:
-      - 'lldb/**'
-      - '.github/workflows/lldb-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-      - '!clang/**'
-      - '!llvm/**'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  build_lldb:
-    if: github.repository_owner == 'llvm'
-    name: Build lldb
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      projects: clang;lldb
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 4e570a7cb1455..9b3d49d4e99b9 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -11,14 +11,12 @@ on:
     paths:
       - 'llvm/**'
       - '.github/workflows/llvm-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
   pull_request:
     branches:
       - 'release/**'
     paths:
       - 'llvm/**'
       - '.github/workflows/llvm-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
 
 concurrency:
   # Skip intermediate builds: always.
@@ -27,14 +25,6 @@ concurrency:
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 
 jobs:
-  check-all:
-    if: github.repository_owner == 'llvm'
-    name: Build and Test
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      build_target: check-all
-      projects: clang;lld;libclc;lldb
-
   abi-dump-setup:
     if: github.repository_owner == 'llvm'
     runs-on: ubuntu-latest
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 54d6e1bf092cf..d7fa671882186 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -10,6 +10,7 @@ on:
   push:
     branches:
       - 'main'
+      - 'release/**'
 
 jobs:
   premerge-checks-linux:
@@ -132,3 +133,72 @@ jobs:
           call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64
           bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}"
 
+  permerge-check-macos:
+    runs-on: macos-14
+    concurrency:
+      group: ${{ github.workflow }}-macos-${{ github.event.pull_request.number || github.sha }}
+      cancel-in-progress: true
+    if: >-
+      (startswith(github.ref_name, 'release/') ||
+       startswith(github.base_ref, 'refs/heads/release/'))
+    steps:
+      - name: Checkout LLVM
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2.14
+        with:
+          max-size: "2000M"
+      - name: Install Ninja
+        uses: llvm/actions/install-ninja@main
+      - name: Build and Test
+        run: |
+          modified_files=$(git diff --name-only HEAD~1...HEAD)
+          modified_dirs=$(echo "$modified_files" | cut -d'/' -f1 | sort -u)
+
+          echo $modified_files
+          echo $modified_dirs
+
+          . ./.ci/compute-projects.sh
+
+          all_projects="clang clang-tools-extra lld lldb llvm mlir"
+          modified_projects="$(keep-modified-projects ${all_projects})"
+
+          # We have to disable the runtimes builds due to https://github.com/llvm/llvm-project/issues/90568
+          # and the lldb tests depend on libcxx, so we need to skip them.
+          mac_check_targets=$(check-targets ${modified_projects} | sort | uniq | tr '\n' ' ' | sed -e 's/check-lldb //g')
+          mac_projects=$(add-dependencies ${modified_projects} | sort | uniq | tr '\n' ' ')
+
+          mac_runtimes_to_test=$(compute-runtimes-to-test ${modified_projects})
+          mac_runtime_check_targets=$(check-targets ${mac_runtimes_to_test} | sort | uniq | tr '\n' ' ')
+          mac_runtimes=$(echo ${mac_runtimes_to_test} | tr ' ' '\n' | sort | uniq | tr '\n' ' ')
+
+          if [[ "${mac_projects}" == "" ]]; then
+            echo "No projects to build"
+            exit 0
+          fi
+
+          echo "Projects to test: ${modified_projects}"
+          echo "Runtimes to test: ${mac_runtimes_to_test}"
+          echo "Building projects: ${mac_projects}"
+          echo "Running project checks targets: ${mac_check_targets}"
+          echo "Building runtimes: ${mac_runtimes}"
+          echo "Running runtimes checks targets: ${mac_runtime_check_targets}"
+
+          # -DLLVM_DISABLE_ASSEMBLY_FILES=ON is for
+          # https://github.com/llvm/llvm-project/issues/81967
+          # Disable sharding in lit so that the LIT_XFAIL environment var works.
+          cmake -G Ninja \
+                -B build \
+                -S llvm \
+                -DLLVM_ENABLE_PROJECTS="$(echo ${mac_projects} | tr ' ' ';')" \
+                -DLLVM_DISABLE_ASSEMBLY_FILES=ON \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DLLDB_INCLUDE_TESTS=OFF \
+                -DLLVM_ENABLE_ASSERTIONS=ON \
+                -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+                -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+          # The libcxx tests fail, so we are skipping the runtime targets.
+          ninja -C build  $mac_check_targets

From f2e1acdcdafdf3e4eec8406f5ceeda883485917f Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 31 Jan 2025 13:45:08 -0800
Subject: [PATCH 005/282] workflows/premerge: Fix condition for macos job
 (#125237)

(cherry picked from commit 95c0c784ac9a91a8e12331ad9574ac6ad75318b1)
---
 .github/workflows/premerge.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index d7fa671882186..91aa3eefde372 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -139,8 +139,9 @@ jobs:
       group: ${{ github.workflow }}-macos-${{ github.event.pull_request.number || github.sha }}
       cancel-in-progress: true
     if: >-
+      github.repository_owner == 'llvm' &&
       (startswith(github.ref_name, 'release/') ||
-       startswith(github.base_ref, 'refs/heads/release/'))
+       startswith(github.base_ref, 'release/'))
     steps:
       - name: Checkout LLVM
         uses: actions/checkout@v4

From 78b5d67e7194a687dc8b6a6b90f3e64191f74189 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 31 Jan 2025 15:43:43 -0800
Subject: [PATCH 006/282] workflows/premerge: Enable macos builds (#125176)

We still have buildkite for testing Linux and Windows, so we don't need
to enable those builds yet.
---
 .github/workflows/premerge.yaml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 91aa3eefde372..c22b35e122b9f 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -5,8 +5,6 @@ permissions:
 
 on:
   pull_request:
-    paths:
-      - .github/workflows/premerge.yaml
   push:
     branches:
       - 'main'
@@ -14,7 +12,7 @@ on:
 
 jobs:
   premerge-checks-linux:
-    if: github.repository_owner == 'llvm'
+    if: false && github.repository_owner == 'llvm'
     runs-on: llvm-premerge-linux-runners
     concurrency:
       group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }}
@@ -73,7 +71,7 @@ jobs:
           ./.ci/monolithic-linux.sh "$(echo ${linux_projects} | tr ' ' ';')" "$(echo ${linux_check_targets})" "$(echo ${linux_runtimes} | tr ' ' ';')" "$(echo ${linux_runtime_check_targets})"
 
   premerge-checks-windows:
-    if: github.repository_owner == 'llvm'
+    if: false && github.repository_owner == 'llvm'
     runs-on: llvm-premerge-windows-runners
     concurrency:
       group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }}

From b23297a7f160ca37102799d1d1b1deb8114f01a4 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 29 Jan 2025 12:48:11 +0000
Subject: [PATCH 007/282] [GlobalISel] Do not run verifier after
 ResetMachineFunctionPass (#124799)

After we fall back from GlobalISel to SDAG, the verifier gets called,
which calls getReservedRegs which uses SIMachineFunctionInfo::usesAGPRs
which caches the result of UsesAGPRs. Because we have just fallen-back
the function is empty and it incorrectly gets cached to false. This
patch makes sure we don't try to run the verifier whilst the function is
empty.

(cherry picked from commit 66e0498dafbfa7f8fd7deaa88ae62bdf38a12113)
---
 llvm/lib/CodeGen/TargetPassConfig.cpp               | 6 ++++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 847a1aef39c56..cf407b5073035 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1039,11 +1039,13 @@ bool TargetPassConfig::addCoreISelPasses() {
 
     if (addGlobalInstructionSelect())
       return true;
+  }
 
-    // Pass to reset the MachineFunction if the ISel failed.
+  // Pass to reset the MachineFunction if the ISel failed. Outside of the above
+  // if so that the verifier is not added to it.
+  if (Selector == SelectorType::GlobalISel)
     addPass(createResetMachineFunctionPass(
         reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));
-  }
 
   // Run the SDAG InstSelector, providing a fallback path when we do not want to
   // abort on not-yet-supported input.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index d9ee276c3f076..44cb4e803ffad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
 
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)

From d5333f265bb7d8a9ba3c51cb7266385827ba7c69 Mon Sep 17 00:00:00 2001
From: vdonaldson <37090318+vdonaldson@users.noreply.github.com>
Date: Thu, 30 Jan 2025 15:15:52 -0500
Subject: [PATCH 008/282] [flang] Build fix (#125087)

Reinstate a preprocessor define for a nonstandard exception.

[124978](https://github.com/llvm/llvm-project/issues/124978#event-16113621403)

(cherry picked from commit 7fa1257c35581268deb5f0fc2faa3ae46358f958)
---
 flang/runtime/exceptions.cpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index b41c7cf438f56..257c71b51edb3 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -13,11 +13,14 @@
 #include <cfenv>
 #if defined(__aarch64__) && !defined(_WIN32)
 #include <fpu_control.h>
-#elif defined(__x86_64__)
+#elif defined(__x86_64__) && !defined(_WIN32)
 #include <xmmintrin.h>
 #endif
 
-// fenv.h may not define exception macros.
+// File fenv.h usually, but not always, defines standard exceptions as both
+// enumerator values and preprocessor #defines. Some x86 environments also
+// define a nonstandard __FE_DENORM enumerator, but without a corresponding
+// #define, which makes it more difficult to determine if it is present or not.
 #ifndef FE_INVALID
 #define FE_INVALID 0
 #endif
@@ -33,6 +36,12 @@
 #ifndef FE_INEXACT
 #define FE_INEXACT 0
 #endif
+#if FE_INVALID == 1 && FE_DIVBYZERO == 4 && FE_OVERFLOW == 8 && \
+    FE_UNDERFLOW == 16 && FE_INEXACT == 32
+#define __FE_DENORM 2
+#else
+#define __FE_DENORM 0
+#endif
 
 namespace Fortran::runtime {
 
@@ -44,11 +53,7 @@ uint32_t RTNAME(MapException)(uint32_t excepts) {
   Terminator terminator{__FILE__, __LINE__};
 
   static constexpr uint32_t v{FE_INVALID};
-#if __x86_64__
-  static constexpr uint32_t s{__FE_DENORM}; // nonstandard, not a #define
-#else
-  static constexpr uint32_t s{0};
-#endif
+  static constexpr uint32_t s{__FE_DENORM};
   static constexpr uint32_t z{FE_DIVBYZERO};
   static constexpr uint32_t o{FE_OVERFLOW};
   static constexpr uint32_t u{FE_UNDERFLOW};

From c560d6f2c7cbc2d3656cdff22366e20f532a2d4d Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Fri, 31 Jan 2025 11:49:28 -0500
Subject: [PATCH 009/282] [flang] Fix building on aarch64 *BSD and musl libc
 after 9d8dc45d17088300e9e2086594ca581b119193c8 (#125183)

The fpu_control.h header appears to be GLIBC specific.

(cherry picked from commit e31c6c97b795e57a7b1ee31ad37eced40c6305ed)
---
 flang/runtime/exceptions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index 257c71b51edb3..344e7216cfaae 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -11,7 +11,7 @@
 #include "flang/Runtime/exceptions.h"
 #include "terminator.h"
 #include <cfenv>
-#if defined(__aarch64__) && !defined(_WIN32)
+#if defined(__aarch64__) && defined(__GLIBC__)
 #include <fpu_control.h>
 #elif defined(__x86_64__) && !defined(_WIN32)
 #include <xmmintrin.h>

From 1c660caad162ec9355781101b1f37e8fbd270ac4 Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs@gmail.com>
Date: Sat, 1 Feb 2025 02:35:07 +0100
Subject: [PATCH 010/282] [analyzer][docs] CSA release notes for clang-20
 (#124798)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commits were gathered using:
```sh
git log --reverse --oneline llvmorg-20-init..llvm/main \
  clang/{lib/StaticAnalyzer,include/clang/StaticAnalyzer} | grep -v NFC | grep -v OpenACC | grep -v -i revert
```

After this I categorized the changes and dropped the less user-facing
commits.

FYI, I also ignored Webkit changes because I assue it's fairly specific
for them, and they likely already know what they ship xD.

I used the `LLVM_ENABLE_SPHINX=ON` and `LLVM_ENABLE_DOXYGEN=ON` cmake
options to enable the `docs-clang-html` build target, which generates
the html into `build/tools/clang/docs/html/ReleaseNotes.html` of which I
attach the screenshots to let you judge if it looks all good or not.

I also used Grammarly this time to check for blatant typos.

---------

Co-authored-by: Donát Nagy <donat.nagy@ericsson.com>
---
 clang/docs/ReleaseNotes.rst | 105 +++++++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 13 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d8a94703bd9c5..3530d1c0e4c19 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1356,30 +1356,61 @@ Static Analyzer
 New features
 ^^^^^^^^^^^^
 
-- Now CSA models `__builtin_*_overflow` functions. (#GH102602)
+- The ``__builtin_*_overflow`` functions are now properly modeled. (#GH102602)
 
-- MallocChecker now checks for ``ownership_returns(class, idx)`` and ``ownership_takes(class, idx)``
-  attributes with class names different from "malloc". Clang static analyzer now reports an error
-  if class of allocation and deallocation function mismatches.
+- ``unix.Malloc`` now checks for ``ownership_returns(class, idx)`` and ``ownership_takes(class, idx)``
+  attributes with class names different from "malloc". It now reports an error
+  if the class of allocation and deallocation function mismatches.
   `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-mismatcheddeallocator-c-c>`__.
 
 - Function effects, e.g. the ``nonblocking`` and ``nonallocating`` "performance constraint"
   attributes, are now verified. For example, for functions declared with the ``nonblocking``
-  attribute, the compiler can generate warnings about the use of any language features, or calls to
+  attribute, the compiler can generate warnings about the use of any language features or calls to
   other functions, which may block.
 
 - Introduced ``-warning-suppression-mappings`` flag to control diagnostic
-  suppressions per file. See `documentation <https://clang.llvm.org/docs/WarningSuppressionMappings.html>_` for details.
+  suppressions per file. See `documentation <https://clang.llvm.org/docs/WarningSuppressionMappings.html>`__ for details.
+
+- Started to model GCC asm statements in some basic way. (#GH103714, #GH109838)
 
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
 
 - In loops where the loop condition is opaque (i.e. the analyzer cannot
   determine whether it's true or false), the analyzer will no longer assume
-  execution paths that perform more that two iterations. These unjustified
+  execution paths that perform more than two iterations. These unjustified
   assumptions caused false positive reports (e.g. 100+ out-of-bounds reports in
   the FFMPEG codebase) in loops where the programmer intended only two or three
   steps but the analyzer wasn't able to understand that the loop is limited.
+  (#GH119388)
+
+- In clang-19, the ``crosscheck-with-z3-timeout-threshold`` was set to 300ms,
+  but it is now reset back to 15000, aka. 15 seconds. This is to reduce the
+  number of flaky diagnostics due to Z3 query timeouts.
+  If you are affected, read the details at #GH118291 carefully.
+
+- Same as the previous point, but for ``crosscheck-with-z3-rlimit-threshold``
+  and ``crosscheck-with-z3-eqclass-timeout-threshold``.
+  This option is now set to zero, aka. disabled by default. (#GH118291)
+
+- Fixed a crash in the ``unix.Stream`` checker when modeling ``fread``. (#GH108393)
+
+- Fixed a crash in the ``core.StackAddressEscape`` checker related to ``alloca``.
+  Fixes (#GH107852).
+
+- Fixed a crash when invoking a function pointer cast from some non-function pointer. (#GH111390)
+
+- Fixed a crash when modeling some ``ArrayInitLoopExpr``. Fixes (#GH112813).
+
+- Fixed a crash in loop unrolling. Fixes (#GH121201).
+
+- The iteration orders of some internal representations of symbols were changed
+  to make their internal ordering more stable. This should improve determinism.
+  This also reduces the number of flaky reports exposed by the Z3 query timeouts.
+  (#GH121749)
+
+- The ``unix.BlockInCriticalSection`` now recognizes the ``lock()`` member function
+  as expected, even if it's inherited from a base class. Fixes (#GH104241).
 
 Improvements
 ^^^^^^^^^^^^
@@ -1388,6 +1419,40 @@ Improvements
   error if the attribute is attached to a function that returns a non-pointer value.
   Fixes (#GH99501)
 
+- Improved the escape heuristics of member variables of non-trivial std types. (#GH100405)
+  Also when invoking an opaque member function. (#GH111138)
+
+- Improved the ``nullability.NullReturnedFromNonnull`` checker by reporting
+  more violations of the ``returns_nonnull`` attribute.
+  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#nullability-nullreturnedfromnonnull-c-c-objc>`_.
+  (#GH106048)
+
+- The ``unix.Stream`` checker now notes the last ``fclose`` call in the diagnostics. (#GH109112)
+
+- The ``core.StackAddressEscape`` checker now detects more leak issues through output
+  parameters and global variables. (#GH105653, #GH105648, #GH107003) Fixes (#GH106834).
+
+- The ``unix.Malloc`` checker was made more consistent with the
+  `ownership attributes <https://clang.llvm.org/docs/AttributeReference.html#analyzer-ownership-attrs>`_.
+  (#GH104599, #GH110115) This also fixed #GH104229.
+
+- The number of false-positive reports of ``alpha.core.FixedAddr`` checker was slightly reduced.
+  (#GH108993, #GH110458)
+
+- Improved the default (range-based) solver by reasoning about more commutative
+  operations, and better deducing some concrete values from their known ranges.
+  (#GH112583, #GH112887, #GH115579)
+
+- A new option ``crosscheck-with-z3-max-attempts-per-query`` should help
+  reducing the number of flaky reports if Z3 query timeouts are used.
+  By default, Z3 queries are attempted at most 3 times, giving it more chances,
+  thus reducing number of flaky issues on timeouts. Read the details in this
+  `RFC <https://discourse.llvm.org/t/analyzer-rfc-retry-z3-crosscheck-queries-on-timeout/83711>`__.
+  (#GH120239)
+
+- The resulting pointer of ``fread`` is now known to never alias with the
+  pointers of ``stdin``, ``stdout`` or ``stderr``. (#GH100085)
+
 Moved checkers
 ^^^^^^^^^^^^^^
 
@@ -1400,21 +1465,35 @@ Moved checkers
   To detect too large arguments passed to malloc, consider using the checker
   ``alpha.taint.TaintedAlloc``.
 
-- The checkers ``alpha.nondeterministic.PointerSorting`` and
+- Both ``alpha.nondeterministic.PointerSorting`` and
   ``alpha.nondeterministic.PointerIteration`` were moved to a new bugprone
   checker named ``bugprone-nondeterministic-pointer-iteration-order``. The
   original checkers were implemented only using AST matching and make more
   sense as a single clang-tidy check.
 
-- The checker ``alpha.unix.Chroot`` was modernized, improved and moved to
-  ``unix.Chroot``. Testing was done on open source projects that use chroot(),
-  and false issues addressed in the improvements based on real use cases. Open
-  source projects used for testing include nsjail, lxroot, dive and ruri.
+- The checker ``alpha.unix.Chroot`` was modernized, improved, and moved to
+  ``unix.Chroot``. Testing was done on open-source projects that use chroot(),
+  and false issues addressed in the improvements based on real use cases.
+  Open-source projects used for testing include ``nsjail``, ``lxroot``, ``dive`` and ``ruri``.
   This checker conforms to SEI Cert C recommendation `POS05-C. Limit access to
   files by creating a jail
   <https://wiki.sei.cmu.edu/confluence/display/c/POS05-C.+Limit+access+to+files+by+creating+a+jail>`_.
   Fixes (#GH34697).
-  (#GH117791) [Documentation](https://clang.llvm.org/docs/analyzer/checkers.html#unix-chroot-c).
+  (#GH117791) `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-chroot-c>`__.
+
+- The checker ``alpha.core.PointerSub`` was moved to ``security.PointerSub``
+  after it was significantly improved in #GH96501, #GH102580, #GH111846.
+
+- The checker ``alpha.security.MmapWriteExec`` was moved to ``security.MmapWriteExec``.
+
+- The checker ``alpha.unix.cstring.NotNullTerminated`` was moved to ``unix.cstring.NotNullTerminated``.
+
+- The division by tainted value diagnostic was split from the checker ``core.DivideZero``
+  into a separate checker ``optin.taint.TaintedDiv``. (#GH106389)
+
+- Both ``alpha.security.taint.TaintPropagation`` and ``alpha.security.taint.GenericTaint``
+  were moved to ``optin.taint.TaintPropagation`` and ``optin.taint.GenericTaint`` respectively.
+  (#GH67352)
 
 .. _release-notes-sanitizers:
 

From 97becb32056ac77a2981b3024eb06fb57f7708e3 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Thu, 30 Jan 2025 10:42:12 +0000
Subject: [PATCH 011/282] [AArch64] Add MSVC mangling for the __mfp8 type
 (#124968)

Fixes #124907

(cherry picked from commit 7939ce6295e7fc0214cd307f97dfccc0cabde381)
---
 clang/lib/AST/MicrosoftMangle.cpp                |  5 +++++
 clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp | 14 ++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp

diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index edeeaeaa9ae17..42b735ccf4a2c 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2792,6 +2792,10 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
     mangleArtificialTagType(TagTypeKind::Struct, "__bf16", {"__clang"});
     break;
 
+  case BuiltinType::MFloat8:
+    mangleArtificialTagType(TagTypeKind::Struct, "__mfp8", {"__clang"});
+    break;
+
 #define WASM_REF_TYPE(InternalName, MangledName, Id, SingletonId, AS)          \
   case BuiltinType::Id:                                                        \
     mangleArtificialTagType(TagTypeKind::Struct, MangledName);                 \
@@ -2808,6 +2812,7 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
 
 #define SVE_TYPE(Name, Id, SingletonId) \
   case BuiltinType::Id:
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)
 #include "clang/Basic/AArch64SVEACLETypes.def"
 #define PPC_VECTOR_TYPE(Name, Id, Size) \
   case BuiltinType::Id:
diff --git a/clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp b/clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp
new file mode 100644
index 0000000000000..b5fd9171ad81a
--- /dev/null
+++ b/clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple aarch64-windows-msvc -emit-llvm -o - %s | FileCheck %s
+
+typedef __mfp8 mf8;
+typedef __attribute__((neon_vector_type(8))) __mfp8 mf8x8_t;
+typedef __attribute__((neon_vector_type(16))) __mfp8 mf8x16_t;
+
+// CHECK: "?f@@YAXU__mfp8@__clang@@@Z"
+void f(mf8 v) {}
+
+// CHECK: "?f@@YAXT?$__vector@U__mfp8@__clang@@$07@__clang@@@Z"
+void f(mf8x8_t v) {}
+
+// CHECK: "?f@@YAXT?$__vector@U__mfp8@__clang@@$0BA@@__clang@@@Z"
+void f(mf8x16_t v) {}

From 40ca089d9930a2647b8942924f9346358339411a Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Thu, 30 Jan 2025 08:58:23 +0800
Subject: [PATCH 012/282] [CodeGenPrepare] Replace deleted ext instr with the
 promoted value. (#71058)

This PR replaces the deleted ext with the promoted value in `AddrMode`.
Fixes #70938.

(cherry picked from commit 3c6aa04cf4dee65113e2a780b9f90b36bb4c4e04)
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 11 +++++
 .../X86/codegen-prepare-addrmode-sext.ll      | 48 +++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 47486a30bba5b..088062afab177 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3074,6 +3074,14 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
   void print(raw_ostream &OS) const;
   void dump() const;
 
+  // Replace From in ExtAddrMode with To.
+  // E.g., SExt insts may be promoted and deleted. We should replace them with
+  // the promoted values.
+  void replaceWith(Value *From, Value *To) {
+    if (ScaledReg == From)
+      ScaledReg = To;
+  }
+
   FieldName compare(const ExtAddrMode &other) {
     // First check that the types are the same on each field, as differing types
     // is something we can't cope with later on.
@@ -5365,6 +5373,9 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
       TPT.rollback(LastKnownGood);
       return false;
     }
+
+    // SExt has been deleted. Make sure it is not referenced by the AddrMode.
+    AddrMode.replaceWith(Ext, PromotedOperand);
     return true;
   }
   case Instruction::Call:
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index f3070cd55903b..9755916f82378 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -620,3 +620,51 @@ define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, ptr %base) {
   %res = load i8, ptr %arrayidx
   ret i8 %res
 }
+
+; Check that we replace the deleted sext with the promoted value.
+define void @pr70938(ptr %f) {
+; CHECK-LABEL: define void @pr70938(
+; CHECK-SAME: ptr [[F:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 0, 1
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[ADD]], 2
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[F]], i64 [[SUNKADDR]]
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, ptr [[SUNKADDR1]], i64 1
+; CHECK-NEXT:    store i8 0, ptr [[SUNKADDR2]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %add = add nsw i32 0, 1
+  %idxprom3 = sext i32 %add to i64
+  %arrayidx4 = getelementptr [2 x [1 x [2 x i8]]], ptr %f, i64 0, i64 %idxprom3
+  %arrayidx8 = getelementptr [2 x i8], ptr %arrayidx4, i64 0, i64 %idxprom3
+  br label %if.end
+
+if.end:                                           ; preds = %entry
+  store i8 0, ptr %arrayidx8, align 1
+  ret void
+}
+
+define void @pr119429() {
+; CHECK-LABEL: define void @pr119429() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[AND:%.*]] = and i64 0, 0
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = inttoptr i64 [[AND]] to ptr
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = mul i64 [[AND]], 2
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, ptr [[SUNKADDR]], i64 [[SUNKADDR1]]
+; CHECK-NEXT:    store i64 0, ptr [[SUNKADDR2]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %and = and i32 0, 0
+  %conv1 = zext i32 %and to i64
+  %sub = add i64 %conv1, 0
+  br label %if.end
+
+if.end:
+  %mul = shl i64 %sub, 1
+  %add = add i64 %mul, %conv1
+  %ptr = inttoptr i64 %add to ptr
+  store i64 0, ptr %ptr, align 8
+  ret void
+}

From a00b0f99237987a3e6983ff6b4a3f0e2f560b31a Mon Sep 17 00:00:00 2001
From: higher-performance <higher.performance.github@gmail.com>
Date: Thu, 30 Jan 2025 15:34:08 -0500
Subject: [PATCH 013/282] Fix false negative when value initializing a field
 annotated with [[clang::require_field_initialization]] (#124329)

It turns out we weren't handling one case: the value-initialization of a
field inside a struct.

I'm not sure why this falls under `IK_Direct` rather than `IK_Value` in
Clang, but it seems to work.

(cherry picked from commit 20fd7df0b847bb46aac2f0b5b71d242220027cbc)
---
 clang/lib/Sema/SemaInit.cpp          |  4 ++-
 clang/test/SemaCXX/uninitialized.cpp | 44 ++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index b95cbbf422205..450edcb52ae15 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -4573,7 +4573,9 @@ static void TryConstructorInitialization(Sema &S,
 
   CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function);
   if (Result != OR_Deleted) {
-    if (!IsListInit && Kind.getKind() == InitializationKind::IK_Default &&
+    if (!IsListInit &&
+        (Kind.getKind() == InitializationKind::IK_Default ||
+         Kind.getKind() == InitializationKind::IK_Direct) &&
         DestRecordDecl != nullptr && DestRecordDecl->isAggregate() &&
         DestRecordDecl->hasUninitializedExplicitInitFields()) {
       S.Diag(Kind.getLocation(), diag::warn_field_requires_explicit_init)
diff --git a/clang/test/SemaCXX/uninitialized.cpp b/clang/test/SemaCXX/uninitialized.cpp
index 52d9897cf9be6..7578b288d7b3f 100644
--- a/clang/test/SemaCXX/uninitialized.cpp
+++ b/clang/test/SemaCXX/uninitialized.cpp
@@ -2,6 +2,8 @@
 // RUN: %clang_cc1 -fsyntax-only -Wall -Wc++20-compat -Wuninitialized -Wno-unused-value -Wno-unused-lambda-capture -Wno-uninitialized-const-reference -std=c++1z -verify %s -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -fsyntax-only -Wall -Wc++20-compat -Wuninitialized -Wno-unused-value -Wno-unused-lambda-capture -Wno-uninitialized-const-reference -std=c++20 -verify %s
 
+void* operator new(__SIZE_TYPE__, void*);
+
 // definitions for std::move
 namespace std {
 inline namespace foo {
@@ -1540,6 +1542,48 @@ void aggregate() {
     };
   };
 
+  struct Embed {
+    int embed1;  // #FIELD_EMBED1
+    int embed2 [[clang::require_explicit_initialization]];  // #FIELD_EMBED2
+  };
+  struct EmbedDerived : Embed {};
+  struct F {
+    Embed f1;
+    // expected-warning@+1 {{field in 'Embed' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+    explicit F(const char(&)[1]) : f1() {
+      // expected-warning@+1 {{field in 'Embed' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+      ::new(static_cast<void*>(&f1)) decltype(f1);
+      // expected-warning@+1 {{field in 'Embed' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+      ::new(static_cast<void*>(&f1)) decltype(f1)();
+#if __cplusplus >= 202002L
+      // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+      ::new(static_cast<void*>(&f1)) decltype(f1)(1);
+#endif
+      // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+      ::new(static_cast<void*>(&f1)) decltype(f1){1};
+    }
+#if __cplusplus >= 202002L
+    // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+    explicit F(const char(&)[2]) : f1(1) {}
+#else
+    explicit F(const char(&)[2]) : f1{1, 2} { }
+#endif
+    // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+    explicit F(const char(&)[3]) : f1{} {}
+    // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+    explicit F(const char(&)[4]) : f1{1} {}
+    // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}}
+    explicit F(const char(&)[5]) : f1{.embed1 = 1} {}
+  };
+  F ctors[] = {
+      F(""),
+      F("_"),
+      F("__"),
+      F("___"),
+      F("____")
+  };
+  (void)ctors;
+
   S::foo(S{1, 2, 3, 4});
   S::foo(S{.s1 = 100, .s4 = 100});
   S::foo(S{.s1 = 100}); // expected-warning {{field 's4' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_S4 {{'s4' declared here}}

From 184d1783db8d67c3c2c7983e296151a131c6ba4e Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard@arm.com>
Date: Wed, 29 Jan 2025 10:42:47 +0000
Subject: [PATCH 014/282] [AArch64] PAUTH_PROLOGUE should not be duplicated
 with PAuthLR (#124775)

When using PAuthLR, the PAUTH_PROLOGUE expands into a sequence of
instructions which takes the address of one of those instructions, and
uses that address to compute the return address signature. If this is
duplicated, there will be two different addresses used in calculating
the signature, so the epilogue will only be correct for (at most) one of
them.

This change also restricts code generation when using v8.3-A return
address signing, without PAuthLR. This isn't strictly needed, as
duplicating the prologue there would be valid. We could fix this by
having two copies of PAUTH_PROLOGUE, with and without isNotDuplicable,
but I don't think it's worth adding the extra complexity to a security
feature for that.

(cherry picked from commit 36b3c43524c8ca86a5050496b8773f07c5ccddff)
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  7 +-
 .../AArch64/pauthlr-prologue-duplication.mir  | 88 +++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/pauthlr-prologue-duplication.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d112d4f10e47d..b77246200db64 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1773,7 +1773,12 @@ def : InstAlias<"xpaclri", (XPACLRI), 0>;
 
 let Uses = [LR, SP], Defs = [LR] in {
 // Insertion point of LR signing code.
-def PAUTH_PROLOGUE : Pseudo<(outs), (ins), []>, Sched<[]>;
+def PAUTH_PROLOGUE : Pseudo<(outs), (ins), []>, Sched<[]> {
+  // When using PAuthLR, the address of one of the instructions this expands
+  // into is used as an input to the signature calculation, so this must not be
+  // duplicated.
+  let isNotDuplicable = 1;
+}
 // Insertion point of LR authentication code.
 // The RET terminator of the containing machine basic block may be replaced
 // with a combined RETA(A|B) instruction when rewriting this Pseudo.
diff --git a/llvm/test/CodeGen/AArch64/pauthlr-prologue-duplication.mir b/llvm/test/CodeGen/AArch64/pauthlr-prologue-duplication.mir
new file mode 100644
index 0000000000000..5e57604263793
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pauthlr-prologue-duplication.mir
@@ -0,0 +1,88 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple aarch64-none-elf -run-pass=block-placement -O3 -o - %s | FileCheck %s
+
+## Check that block-placement does not perform tail duplication on the
+## PAUTH_EPILOGUE instruction. If that happened, the two prologues would use
+## different addresses while calculating the return address signature, so the
+## epilogue could only be correct for (at most) one of them.
+
+--- |
+  define void @test() "frame-pointer"="non-leaf" {
+  entry:
+    ret void
+  }
+
+  declare void @f()
+...
+---
+name:            test
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1, $lr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1, $lr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $w1, $lr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w8 = MOVZWi 1, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x30000000), %bb.4(0x50000000)
+  ; CHECK-NEXT:   liveins: $w1, $w8, $lr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-setup PAUTH_PROLOGUE implicit-def $lr, implicit killed $lr, implicit $sp
+  ; CHECK-NEXT:   CBZW killed renamable $w1, %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   BL @f, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   BL @f, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+  ; CHECK-NEXT:   frame-destroy PAUTH_EPILOGUE implicit-def $lr, implicit killed $lr, implicit $sp
+  ; CHECK-NEXT:   TCRETURNdi @f, 0, csr_aarch64_aapcs, implicit $sp
+  bb.0.entry:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $w0, $w1, $lr
+
+    CBNZW renamable $w0, %bb.2
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+    liveins: $w1, $lr
+
+    renamable $w8 = MOVZWi 1, 0
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+    liveins: $w0, $w1, $lr
+
+  bb.3:
+    successors: %bb.5(0x30000000), %bb.4(0x50000000)
+    liveins: $w1, $w8, $lr
+
+    frame-setup PAUTH_PROLOGUE implicit-def $lr, implicit killed $lr, implicit $sp
+    CBZW killed renamable $w1, %bb.5
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+
+    BL @f, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+
+  bb.5:
+    BL @f, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+    frame-destroy PAUTH_EPILOGUE implicit-def $lr, implicit killed $lr, implicit $sp
+    TCRETURNdi @f, 0, csr_aarch64_aapcs, implicit $sp
+...

From fe005eb0696f5255912a6fbe0e6bbd8c168340f7 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Fri, 31 Jan 2025 21:00:52 -0500
Subject: [PATCH 015/282] Allow 'inline' on some declarations in MS
 compatibility mode (#125250) (#125275)

Microsoft allows the 'inline' specifier on a typedef of a function type
in C modes. This is used by a system header (ufxclient.h), so instead of
giving a hard error, we diagnose with a warning. C++ mode and non-
Microsoft compatibility modes are not impacted.

Fixes https://github.com/llvm/llvm-project/issues/124869

(cherry picked from commit ef91caec2cf313624829114802cff92ae682e550)
---
 clang/docs/ReleaseNotes.rst                      |  2 ++
 clang/include/clang/Basic/DiagnosticGroups.td    |  4 +++-
 clang/include/clang/Basic/DiagnosticSemaKinds.td |  2 ++
 clang/lib/Sema/SemaDecl.cpp                      |  5 ++++-
 clang/test/Sema/MicrosoftCompatibility.c         | 16 ++++++++++++++--
 clang/test/Sema/MicrosoftCompatibility.cpp       |  7 +++++++
 6 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3530d1c0e4c19..88c9fc8f95e4a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -397,6 +397,8 @@ Resolutions to C++ Defect Reports
 C Language Changes
 ------------------
 
+- Clang now allows an ``inline`` specifier on a typedef declaration of a
+  function type in Microsoft compatibility mode. #GH124869
 - Extend clang's ``<limits.h>`` to define ``LONG_LONG_*`` macros for Android's bionic.
 - Macro ``__STDC_NO_THREADS__`` is no longer necessary for MSVC 2022 1939 and later.
 - Exposed the the ``__nullptr`` keyword as an alias for ``nullptr`` in all C language modes.
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 209792f851b6a..af57a42b1ec5a 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1306,6 +1306,8 @@ def MicrosoftStaticAssert : DiagGroup<"microsoft-static-assert">;
 def MicrosoftInitFromPredefined : DiagGroup<"microsoft-init-from-predefined">;
 def MicrosoftStringLiteralFromPredefined : DiagGroup<
     "microsoft-string-literal-from-predefined">;
+def MicrosoftInlineOnNonFunction : DiagGroup<
+    "microsoft-inline-on-non-function">;
 
 // Aliases.
 def : DiagGroup<"msvc-include", [MicrosoftInclude]>;
@@ -1324,7 +1326,7 @@ def Microsoft : DiagGroup<"microsoft",
      MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag,
      MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftStaticAssert,
      MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined,
-     MicrosoftInconsistentDllImport]>;
+     MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>;
 
 def ClangClPch : DiagGroup<"clang-cl-pch">;
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index db911ed121e95..ec2a140e04d5b 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -482,6 +482,8 @@ def ext_use_out_of_scope_declaration : ExtWarn<
   InGroup<DiagGroup<"out-of-scope-function">>;
 def err_inline_non_function : Error<
   "'inline' can only appear on functions%select{| and non-local variables}0">;
+def warn_ms_inline_non_function : ExtWarn<err_inline_non_function.Summary>,
+  InGroup<MicrosoftInlineOnNonFunction>;
 def err_noreturn_non_function : Error<
   "'_Noreturn' can only appear on functions">;
 def warn_qual_return_type : Warning<
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 3cad9827fdab6..1ecb9aff5f319 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -6681,7 +6681,10 @@ Sema::ActOnTypedefDeclarator(Scope* S, Declarator& D, DeclContext* DC,
   DiagnoseFunctionSpecifiers(D.getDeclSpec());
 
   if (D.getDeclSpec().isInlineSpecified())
-    Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function)
+    Diag(D.getDeclSpec().getInlineSpecLoc(),
+         (getLangOpts().MSVCCompat && !getLangOpts().CPlusPlus)
+             ? diag::warn_ms_inline_non_function
+             : diag::err_inline_non_function)
         << getLangOpts().CPlusPlus17;
   if (D.getDeclSpec().hasConstexprSpecifier())
     Diag(D.getDeclSpec().getConstexprSpecLoc(), diag::err_invalid_constexpr)
diff --git a/clang/test/Sema/MicrosoftCompatibility.c b/clang/test/Sema/MicrosoftCompatibility.c
index 9a1f050747f9d..8d402d53e004d 100644
--- a/clang/test/Sema/MicrosoftCompatibility.c
+++ b/clang/test/Sema/MicrosoftCompatibility.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify -fms-compatibility -DMSVCCOMPAT -triple i686-pc-win32
-// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify -fms-extensions -triple i686-pc-win32
+// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify=expected,compat -fms-compatibility -DMSVCCOMPAT -triple i686-pc-win32
+// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify=expected,ext -fms-extensions -triple i686-pc-win32
 
 #ifdef MSVCCOMPAT
 enum ENUM1; // expected-warning {{forward references to 'enum' types are a Microsoft extension}}
@@ -35,3 +35,15 @@ size_t x;
 #else
 size_t x; // expected-error {{unknown type name 'size_t'}}
 #endif
+
+/* Microsoft allows inline, __inline, and __forceinline to appear on a typedef
+   of a function type; this is used in their system headers such as ufxclient.h
+   See GitHub #124869 for more details.
+ */
+typedef int inline Foo1(int);       // compat-warning {{'inline' can only appear on functions}} \
+                                       ext-error {{'inline' can only appear on functions}}
+typedef int __inline Foo2(int);     // compat-warning {{'inline' can only appear on functions}} \
+                                       ext-error {{'inline' can only appear on functions}}
+typedef int __forceinline Foo(int); // compat-warning {{'inline' can only appear on functions}} \
+                                       ext-error {{'inline' can only appear on functions}} \
+                                       expected-warning {{'__forceinline' attribute only applies to functions and statements}}
diff --git a/clang/test/Sema/MicrosoftCompatibility.cpp b/clang/test/Sema/MicrosoftCompatibility.cpp
index 90a45dfaaf176..391977e2765c5 100644
--- a/clang/test/Sema/MicrosoftCompatibility.cpp
+++ b/clang/test/Sema/MicrosoftCompatibility.cpp
@@ -8,3 +8,10 @@ struct cls {
 };
 
 char * cls::* __uptr wrong2 = &cls::m; // expected-error {{'__uptr' attribute cannot be used with pointers to members}}
+
+// Microsoft allows inline, __inline, and __forceinline to appear on a typedef
+// of a function type, but only in C. See GitHub #124869 for more details.
+typedef int inline Foo1(int);       // expected-error {{'inline' can only appear on functions}}
+typedef int __inline Foo2(int);     // expected-error {{'inline' can only appear on functions}}
+typedef int __forceinline Foo(int); // expected-error {{'inline' can only appear on functions}} \
+                                       expected-warning {{'__forceinline' attribute only applies to functions and statements}}

From 2c142b23aef0beae9697eb1b10e9f6d954223794 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Thu, 30 Jan 2025 08:55:34 +0000
Subject: [PATCH 016/282] [clang][SME] Account for C++ lambdas in SME builtin
 diagnostics (#124750)

A C++ lambda does not inherit attributes from the parent function. So
the SME builtin diagnostics should look at the lambda's attributes, not
the parent function's.

The fix is very simple and just adds the missing "AllowLambda" flag to
the function decl lookups.

(cherry picked from commit 2b7509e9885c9a5656bb3c201421e146a21fb88e)
---
 clang/lib/Sema/SemaARM.cpp                    |  9 ++-
 ... => aarch64-incompat-sm-builtin-calls.cpp} | 68 ++++++++++++++++++-
 2 files changed, 73 insertions(+), 4 deletions(-)
 rename clang/test/Sema/{aarch64-incompat-sm-builtin-calls.c => aarch64-incompat-sm-builtin-calls.cpp} (71%)

diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 9fbe8358f716b..71dfe68f104ed 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -636,7 +636,8 @@ static ArmSMEState getSMEState(unsigned BuiltinID) {
 
 bool SemaARM::CheckSMEBuiltinFunctionCall(unsigned BuiltinID,
                                           CallExpr *TheCall) {
-  if (const FunctionDecl *FD = SemaRef.getCurFunctionDecl()) {
+  if (const FunctionDecl *FD =
+          SemaRef.getCurFunctionDecl(/*AllowLambda=*/true)) {
     std::optional<ArmStreamingType> BuiltinType;
 
     switch (BuiltinID) {
@@ -676,7 +677,8 @@ bool SemaARM::CheckSMEBuiltinFunctionCall(unsigned BuiltinID,
 
 bool SemaARM::CheckSVEBuiltinFunctionCall(unsigned BuiltinID,
                                           CallExpr *TheCall) {
-  if (const FunctionDecl *FD = SemaRef.getCurFunctionDecl()) {
+  if (const FunctionDecl *FD =
+          SemaRef.getCurFunctionDecl(/*AllowLambda=*/true)) {
     std::optional<ArmStreamingType> BuiltinType;
 
     switch (BuiltinID) {
@@ -705,7 +707,8 @@ bool SemaARM::CheckSVEBuiltinFunctionCall(unsigned BuiltinID,
 bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
                                            unsigned BuiltinID,
                                            CallExpr *TheCall) {
-  if (const FunctionDecl *FD = SemaRef.getCurFunctionDecl()) {
+  if (const FunctionDecl *FD =
+          SemaRef.getCurFunctionDecl(/*AllowLambda=*/true)) {
 
     switch (BuiltinID) {
     default:
diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.cpp
similarity index 71%
rename from clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
rename to clang/test/Sema/aarch64-incompat-sm-builtin-calls.cpp
index 27fa8f7c9dccb..3fbcaf4a13d67 100644
--- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
+++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.cpp
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1  -triple aarch64-none-linux-gnu -target-feature +sve \
+// RUN: %clang_cc1  -std=c++23 -triple aarch64-none-linux-gnu -target-feature +sve \
 // RUN:   -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -target-feature +sve2 -target-feature +neon -Waarch64-sme-attributes -fsyntax-only -verify %s
 
 // REQUIRES: aarch64-registered-target
@@ -126,3 +126,69 @@ void missing_zt0(void) __arm_streaming {
 
 __arm_new("zt0")
 void new_zt0(void) __arm_streaming { svzero_zt(0); }
+
+/// C++ lambda tests:
+
+void use_streaming_builtin_in_lambda(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za")
+{
+  [&]{
+    /// The lambda is its own function and does not inherit the SME attributes (so this should error).
+    // expected-error@+1 {{builtin can only be called from a streaming function}}
+    svld1_hor_za64(0, slice_base, pg, ptr);
+  }();
+}
+
+void use_streaming_builtin(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za")
+{
+  /// Without the lambda the same builtin is okay (as the SME attributes apply).
+  svld1_hor_za64(0, slice_base, pg, ptr);
+}
+
+int16x8_t use_neon_builtin_sm(int16x8_t splat) __arm_streaming_compatible {
+  // expected-error@+1 {{builtin can only be called from a non-streaming function}}
+  return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
+}
+
+int16x8_t use_neon_builtin_sm_in_lambda(int16x8_t splat) __arm_streaming_compatible {
+  return [&]{
+    /// This should not error (as we switch out of streaming mode to execute the lambda).
+    /// Note: The result int16x8_t is spilled and reloaded as a q-register.
+    return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
+  }();
+}
+
+float use_incomp_sve_builtin_sm() __arm_streaming {
+  // expected-error@+1 {{builtin can only be called from a non-streaming function}}
+  return svadda(svptrue_b32(), 0, svdup_f32(1));
+}
+
+float incomp_sve_sm_fadda_sm_in_lambda(void) __arm_streaming {
+  return [&]{
+    /// This should work like the Neon builtin.
+    return svadda(svptrue_b32(), 0, svdup_f32(1));
+  }();
+}
+
+void use_streaming_builtin_in_streaming_lambda(uint32_t slice_base, const void *ptr)
+{
+  [&]  __arm_new("za") () __arm_streaming {
+    // Here the lambda is streaming with ZA state, so this is okay.
+    svld1_hor_za64(0, slice_base, svptrue_b64(), ptr);
+  }();
+}
+
+int16x8_t use_neon_builtin_in_streaming_lambda(int16x8_t splat) {
+  return [&]() __arm_streaming_compatible {
+    /// This should error as the lambda is streaming-compatible.
+    // expected-error@+1 {{builtin can only be called from a non-streaming function}}
+    return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
+  }();
+}
+
+float incomp_sve_fadda_in_streaming_lambda(void) {
+  return [&]() __arm_streaming {
+    // Should error (like the Neon case above).
+    // expected-error@+1 {{builtin can only be called from a non-streaming function}}
+    return svadda(svptrue_b32(), 0, svdup_f32(1));
+  }();
+}

From d777df5cbd35b301826b2b1500b5eb02d56818d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 29 Jan 2025 15:25:43 +0200
Subject: [PATCH 017/282] [libcxx] Use _ftelli64/_fseeki64 on Windows (#123128)

This allows using the full 64 bit range for file offsets.

This should fix the issue reported downstream at
https://github.com/mstorsjo/llvm-mingw/issues/462.

(cherry picked from commit 86e20b00c313e96db3b69d440bfb2ca9063f08f0)
---
 libcxx/include/fstream                        | 55 ++++++++++---------
 .../ifstream.members/offset_range.pass.cpp    |  6 --
 2 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index f0e9425e0a53d..de5c07035dba9 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -216,12 +216,6 @@ typedef basic_fstream<wchar_t> wfstream;
 _LIBCPP_PUSH_MACROS
 #  include <__undef_macros>
 
-#  if !defined(_LIBCPP_MSVCRT) && !defined(_NEWLIB_VERSION)
-#    define _LIBCPP_HAS_OFF_T_FUNCTIONS 1
-#  else
-#    define _LIBCPP_HAS_OFF_T_FUNCTIONS 0
-#  endif
-
 #  if _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
 
 _LIBCPP_BEGIN_NAMESPACE_STD
@@ -362,6 +356,9 @@ private:
   bool __read_mode();
   void __write_mode();
 
+  _LIBCPP_HIDE_FROM_ABI static int __fseek(FILE* __file, pos_type __offset, int __whence);
+  _LIBCPP_HIDE_FROM_ABI static pos_type __ftell(FILE* __file);
+
   _LIBCPP_EXPORTED_FROM_ABI friend FILE* __get_ostream_file(ostream&);
 
   // There are multiple (__)open function, they use different C-API open
@@ -936,31 +933,42 @@ basic_filebuf<_CharT, _Traits>::seekoff(off_type __off, ios_base::seekdir __way,
   default:
     return pos_type(off_type(-1));
   }
-#    if !_LIBCPP_HAS_OFF_T_FUNCTIONS
-  if (fseek(__file_, __width > 0 ? __width * __off : 0, __whence))
+  if (__fseek(__file_, __width > 0 ? __width * __off : 0, __whence))
     return pos_type(off_type(-1));
-  pos_type __r = ftell(__file_);
-#    else
-  if (::fseeko(__file_, __width > 0 ? __width * __off : 0, __whence))
-    return pos_type(off_type(-1));
-  pos_type __r = ftello(__file_);
-#    endif
+  pos_type __r = __ftell(__file_);
   __r.state(__st_);
   return __r;
 }
 
+template <class _CharT, class _Traits>
+int basic_filebuf<_CharT, _Traits>::__fseek(FILE* __file, pos_type __offset, int __whence) {
+#    if defined(_LIBCPP_MSVCRT_LIKE)
+  return _fseeki64(__file, __offset, __whence);
+#    elif defined(_NEWLIB_VERSION)
+  return fseek(__file, __offset, __whence);
+#    else
+  return ::fseeko(__file, __offset, __whence);
+#    endif
+}
+
+template <class _CharT, class _Traits>
+typename basic_filebuf<_CharT, _Traits>::pos_type basic_filebuf<_CharT, _Traits>::__ftell(FILE* __file) {
+#    if defined(_LIBCPP_MSVCRT_LIKE)
+  return _ftelli64(__file);
+#    elif defined(_NEWLIB_VERSION)
+  return ftell(__file);
+#    else
+  return ftello(__file);
+#    endif
+}
+
 template <class _CharT, class _Traits>
 typename basic_filebuf<_CharT, _Traits>::pos_type
 basic_filebuf<_CharT, _Traits>::seekpos(pos_type __sp, ios_base::openmode) {
   if (__file_ == nullptr || sync())
     return pos_type(off_type(-1));
-#    if !_LIBCPP_HAS_OFF_T_FUNCTIONS
-  if (fseek(__file_, __sp, SEEK_SET))
+  if (__fseek(__file_, __sp, SEEK_SET))
     return pos_type(off_type(-1));
-#    else
-  if (::fseeko(__file_, __sp, SEEK_SET))
-    return pos_type(off_type(-1));
-#    endif
   __st_ = __sp.state();
   return __sp;
 }
@@ -1007,13 +1015,8 @@ int basic_filebuf<_CharT, _Traits>::sync() {
         }
       }
     }
-#    if !_LIBCPP_HAS_OFF_T_FUNCTIONS
-    if (fseek(__file_, -__c, SEEK_CUR))
+    if (__fseek(__file_, -__c, SEEK_CUR))
       return -1;
-#    else
-    if (::fseeko(__file_, -__c, SEEK_CUR))
-      return -1;
-#    endif
     if (__update_st)
       __st_ = __state;
     __extbufnext_ = __extbufend_ = __extbuf_;
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp
index 6ffe750564c2c..c6e07d045e145 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp
@@ -11,12 +11,6 @@
 // Test that we can seek using offsets larger than 32 bit, and that we can
 // retrieve file offsets larger than 32 bit.
 
-// On MSVC targets, we only use the 32 bit fseek/ftell functions. For MinGW
-// targets, we use fseeko/ftello, but the user needs to define
-// _FILE_OFFSET_BITS=64 to make them 64 bit.
-//
-// XFAIL: target={{.*}}-windows{{.*}}
-
 // On 32 bit Android platforms, off_t is 32 bit by default. By defining
 // _FILE_OFFSET_BITS=64, one gets a 64 bit off_t, but the corresponding
 // 64 bit ftello/fseeko functions are only available since Android API 24 (7.0).

From 1eb7f4e6b46179be2388529261502318a802ec84 Mon Sep 17 00:00:00 2001
From: Hubert Tong <hubert.reinterpretcast@gmail.com>
Date: Sat, 1 Feb 2025 21:04:18 -0400
Subject: [PATCH 018/282] release/20.x: [Clang][ReleaseNotes] Document
 -fclang-abi-compat=19 re: #110503 (#125368)

#110503 updates the scope of `-fclang-abi-compat` but did not make that
clear in the release notes. This PR addresses that problem.
---
 clang/docs/ReleaseNotes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 88c9fc8f95e4a..53534d821b2c9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -252,7 +252,7 @@ ABI Changes in This Version
 
 - Fixed Microsoft name mangling of placeholder, auto and decltype(auto), return types for MSVC 1920+. This change resolves incompatibilities with code compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by earlier versions of Clang unless such code is built with the compiler option -fms-compatibility-version=19.14 to imitate the MSVC 1914 mangling behavior.
 - Fixed the Itanium mangling of the construction vtable name. This change will introduce incompatibilities with code compiled by Clang 19 and earlier versions, unless the -fclang-abi-compat=19 option is used. (#GH108015)
-- Mangle member-like friend function templates as members of the enclosing class. (#GH110247, #GH110503)
+- Mangle member-like friend function templates as members of the enclosing class. This can be disabled using -fclang-abi-compat=19. (#GH110247, #GH110503)
 
 AST Dumping Potentially Breaking Changes
 ----------------------------------------

From af7f483a9d801252247b6c72e3763c1f55c5a506 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Sat, 1 Feb 2025 18:04:34 -0800
Subject: [PATCH 019/282] Set version to 20.1.0-rc1 (#125367)

---
 cmake/Modules/LLVMVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 281d0444255ba..fd7cc0868aa3c 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -10,6 +10,6 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX git)
+  set(LLVM_VERSION_SUFFIX -rc1)
 endif()
 

From 0cca13f758a8bda75eab45ad4bf896bb83921ec9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 3 Feb 2025 16:28:24 -0800
Subject: [PATCH 020/282] workflows/premerge: Cancel in progress jobs when a PR
 is merged (#125329) (#125588)

(cherry picked from commit 2deba08e09b9412c9f4e5888237e28173dee085b)
---
 .github/workflows/premerge.yaml | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index c22b35e122b9f..956760feaa3b5 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -5,6 +5,17 @@ permissions:
 
 on:
   pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      # When a PR is closed, we still start this workflow, but then skip
+      # all the jobs, which makes it effectively a no-op.  The reason to
+      # do this is that it allows us to take advantage of concurrency groups
+      # to cancel in progress CI jobs whenever the PR is closed.
+      - closed
+    paths:
+      - .github/workflows/premerge.yaml
   push:
     branches:
       - 'main'
@@ -12,7 +23,9 @@ on:
 
 jobs:
   premerge-checks-linux:
-    if: false && github.repository_owner == 'llvm'
+    if: >-
+        false && github.repository_owner == 'llvm' &&
+        (github.event_name != 'pull_request' || github.event.action != 'closed')
     runs-on: llvm-premerge-linux-runners
     concurrency:
       group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }}
@@ -71,7 +84,9 @@ jobs:
           ./.ci/monolithic-linux.sh "$(echo ${linux_projects} | tr ' ' ';')" "$(echo ${linux_check_targets})" "$(echo ${linux_runtimes} | tr ' ' ';')" "$(echo ${linux_runtime_check_targets})"
 
   premerge-checks-windows:
-    if: false && github.repository_owner == 'llvm'
+    if: >-
+        false && github.repository_owner == 'llvm' &&
+        (github.event_name != 'pull_request' || github.event.action != 'closed')
     runs-on: llvm-premerge-windows-runners
     concurrency:
       group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }}
@@ -139,7 +154,8 @@ jobs:
     if: >-
       github.repository_owner == 'llvm' &&
       (startswith(github.ref_name, 'release/') ||
-       startswith(github.base_ref, 'release/'))
+       startswith(github.base_ref, 'release/')) &&
+      (github.event_name != 'pull_request' || github.event.action != 'closed')
     steps:
       - name: Checkout LLVM
         uses: actions/checkout@v4

From dc50bb0b14559f8c8f5c11bac96b6348e27d12f5 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 3 Feb 2025 13:13:11 -0800
Subject: [PATCH 021/282] workflows/release-tasks: Re-use release-binaries-all
 workflow (#125378)

This way we don't need to duplicate the list of supported targets in the
release-tasks workflow.

(cherry picked from commit d194c6b9a7fdda7a61abcd6bfe39ab465bf0cc87)
---
 .github/workflows/release-tasks.yml | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index 780dd0ff6325c..52076ea1821b0 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -89,20 +89,10 @@ jobs:
     needs:
       - validate-tag
       - release-create
-    strategy:
-      fail-fast: false
-      matrix:
-        runs-on:
-          - ubuntu-22.04
-          - windows-2022
-          - macos-13
-          - macos-14
-
-    uses: ./.github/workflows/release-binaries.yml
+    uses: ./.github/workflows/release-binaries-all.yml
     with:
       release-version: ${{ needs.validate-tag.outputs.release-version }}
       upload: true
-      runs-on: ${{ matrix.runs-on }}
     # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use.
     secrets:
       RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}

From d185bd94ff7717429fd2fffbcd0d4c7c64c05f0b Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 31 Jan 2025 07:57:43 +0000
Subject: [PATCH 022/282] [AArch64] Enable vscale_range with +sme (#124466)

If we have +sme but not +sve, we would not set vscale_range on
functions. It should be valid to apply it with the same range with just
+sme, which can help mitigate some performance regressions in cases such
as scalable vector bitcasts (https://godbolt.org/z/exhe4jd8d).

(cherry picked from commit 9f1c825fb62319b94ac9604f733afd59e9eb461b)
---
 clang/include/clang/Basic/TargetInfo.h          |  3 ++-
 clang/lib/AST/ASTContext.cpp                    |  3 ++-
 clang/lib/AST/ItaniumMangle.cpp                 |  2 +-
 clang/lib/Basic/Targets/AArch64.cpp             |  5 +++--
 clang/lib/Basic/Targets/AArch64.h               |  3 ++-
 clang/lib/Basic/Targets/RISCV.cpp               |  5 +++--
 clang/lib/Basic/Targets/RISCV.h                 |  3 ++-
 clang/lib/CodeGen/CodeGenFunction.cpp           | 17 +++++++++--------
 clang/lib/CodeGen/Targets/RISCV.cpp             |  4 ++--
 clang/lib/Sema/SemaType.cpp                     |  3 ++-
 .../sme-intrinsics/aarch64-sme-attrs.cpp        |  4 ++--
 11 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 43c09cf1f973e..d762144478b48 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1023,7 +1023,8 @@ class TargetInfo : public TransferrableTargetInfo,
 
   /// Returns target-specific min and max values VScale_Range.
   virtual std::optional<std::pair<unsigned, unsigned>>
-  getVScaleRange(const LangOptions &LangOpts) const {
+  getVScaleRange(const LangOptions &LangOpts,
+                 bool IsArmStreamingFunction) const {
     return std::nullopt;
   }
   /// The __builtin_clz* and __builtin_ctz* built-in
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index cd1bcb3b9a063..e58091ce95f62 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -10363,7 +10363,8 @@ bool ASTContext::areLaxCompatibleSveTypes(QualType FirstType,
 /// getRVVTypeSize - Return RVV vector register size.
 static uint64_t getRVVTypeSize(ASTContext &Context, const BuiltinType *Ty) {
   assert(Ty->isRVVVLSBuiltinType() && "Invalid RVV Type");
-  auto VScale = Context.getTargetInfo().getVScaleRange(Context.getLangOpts());
+  auto VScale =
+      Context.getTargetInfo().getVScaleRange(Context.getLangOpts(), false);
   if (!VScale)
     return 0;
 
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 49089c0ea3c8a..f84ccefd34cac 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -4198,7 +4198,7 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) {
 
   // Apend the LMUL suffix.
   auto VScale = getASTContext().getTargetInfo().getVScaleRange(
-      getASTContext().getLangOpts());
+      getASTContext().getLangOpts(), false);
   unsigned VLen = VScale->first * llvm::RISCV::RVVBitsPerBlock;
 
   if (T->getVectorKind() == VectorKind::RVVFixedLengthData) {
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 0b899137bbb5c..57c9849ef2a72 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -703,12 +703,13 @@ ArrayRef<Builtin::Info> AArch64TargetInfo::getTargetBuiltins() const {
 }
 
 std::optional<std::pair<unsigned, unsigned>>
-AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts) const {
+AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts,
+                                  bool IsArmStreamingFunction) const {
   if (LangOpts.VScaleMin || LangOpts.VScaleMax)
     return std::pair<unsigned, unsigned>(
         LangOpts.VScaleMin ? LangOpts.VScaleMin : 1, LangOpts.VScaleMax);
 
-  if (hasFeature("sve"))
+  if (hasFeature("sve") || (IsArmStreamingFunction && hasFeature("sme")))
     return std::pair<unsigned, unsigned>(1, 16);
 
   return std::nullopt;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 600940f5e4e23..b75d2a9dc8eca 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -184,7 +184,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   ArrayRef<Builtin::Info> getTargetBuiltins() const override;
 
   std::optional<std::pair<unsigned, unsigned>>
-  getVScaleRange(const LangOptions &LangOpts) const override;
+  getVScaleRange(const LangOptions &LangOpts,
+                 bool IsArmStreamingFunction) const override;
   bool doesFeatureAffectCodeGen(StringRef Name) const override;
   bool validateCpuSupports(StringRef FeatureStr) const override;
   bool hasFeature(StringRef Feature) const override;
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 8167d7603b0e1..61b8ae9d098ab 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -222,7 +222,7 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
   // Currently we support the v1.0 RISC-V V intrinsics.
   Builder.defineMacro("__riscv_v_intrinsic", Twine(getVersionValue(1, 0)));
 
-  auto VScale = getVScaleRange(Opts);
+  auto VScale = getVScaleRange(Opts, false);
   if (VScale && VScale->first && VScale->first == VScale->second)
     Builder.defineMacro("__riscv_v_fixed_vlen",
                         Twine(VScale->first * llvm::RISCV::RVVBitsPerBlock));
@@ -289,7 +289,8 @@ bool RISCVTargetInfo::initFeatureMap(
 }
 
 std::optional<std::pair<unsigned, unsigned>>
-RISCVTargetInfo::getVScaleRange(const LangOptions &LangOpts) const {
+RISCVTargetInfo::getVScaleRange(const LangOptions &LangOpts,
+                                bool IsArmStreamingFunction) const {
   // RISCV::RVVBitsPerBlock is 64.
   unsigned VScaleMin = ISAInfo->getMinVLen() / llvm::RISCV::RVVBitsPerBlock;
 
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index bb3f3a5cda7c6..d31c46f2bb16c 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -99,7 +99,8 @@ class RISCVTargetInfo : public TargetInfo {
                  const std::vector<std::string> &FeaturesVec) const override;
 
   std::optional<std::pair<unsigned, unsigned>>
-  getVScaleRange(const LangOptions &LangOpts) const override;
+  getVScaleRange(const LangOptions &LangOpts,
+                 bool IsArmStreamingFunction) const override;
 
   bool hasFeature(StringRef Feature) const override;
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index bbef277a52448..08165e0b28406 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -551,14 +551,6 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
     CurFn->addFnAttr("min-legal-vector-width",
                      llvm::utostr(LargestVectorWidth));
 
-  // Add vscale_range attribute if appropriate.
-  std::optional<std::pair<unsigned, unsigned>> VScaleRange =
-      getContext().getTargetInfo().getVScaleRange(getLangOpts());
-  if (VScaleRange) {
-    CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs(
-        getLLVMContext(), VScaleRange->first, VScaleRange->second));
-  }
-
   // If we generated an unreachable return block, delete it now.
   if (ReturnBlock.isValid() && ReturnBlock.getBlock()->use_empty()) {
     Builder.ClearInsertionPoint();
@@ -1110,6 +1102,15 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   if (FD && FD->isMain())
     Fn->removeFnAttr("zero-call-used-regs");
 
+  // Add vscale_range attribute if appropriate.
+  std::optional<std::pair<unsigned, unsigned>> VScaleRange =
+      getContext().getTargetInfo().getVScaleRange(
+          getLangOpts(), FD ? IsArmStreamingFunction(FD, true) : false);
+  if (VScaleRange) {
+    CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs(
+        getLLVMContext(), VScaleRange->first, VScaleRange->second));
+  }
+
   llvm::BasicBlock *EntryBB = createBasicBlock("entry", CurFn);
 
   // Create a marker to make it easy to insert allocas into the entryblock
diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp
index 2b70f2bd3f38b..2c48ba37fd206 100644
--- a/clang/lib/CodeGen/Targets/RISCV.cpp
+++ b/clang/lib/CodeGen/Targets/RISCV.cpp
@@ -367,8 +367,8 @@ ABIArgInfo RISCVABIInfo::coerceVLSVector(QualType Ty) const {
   const auto *VT = Ty->castAs<VectorType>();
   assert(VT->getElementType()->isBuiltinType() && "expected builtin type!");
 
-  auto VScale =
-      getContext().getTargetInfo().getVScaleRange(getContext().getLangOpts());
+  auto VScale = getContext().getTargetInfo().getVScaleRange(
+      getContext().getLangOpts(), false);
 
   unsigned NumElts = VT->getNumElements();
   llvm::Type *EltType = llvm::Type::getInt1Ty(getVMContext());
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 33d5378944ddb..1a591a5376f5e 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -8491,7 +8491,8 @@ static void HandleRISCVRVVVectorBitsTypeAttr(QualType &CurType,
     return;
   }
 
-  auto VScale = S.Context.getTargetInfo().getVScaleRange(S.getLangOpts());
+  auto VScale =
+      S.Context.getTargetInfo().getVScaleRange(S.getLangOpts(), false);
   if (!VScale || !VScale->first || VScale->first != VScale->second) {
     S.Diag(Attr.getLoc(), diag::err_attribute_riscv_rvv_bits_unsupported)
         << Attr;
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp
index 54762c8b41412..c734c6953e5d1 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp
@@ -300,12 +300,12 @@ int test_variadic_template() __arm_inout("za") {
               preserves_za_decl);
 }
 
-// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }

From 8be3fc0f5c7bdad7718394e34ebc5087704a1027 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Wed, 5 Feb 2025 13:34:43 +0000
Subject: [PATCH 023/282] [AArch64] Disallow vscale x 1  partial reductions
 (#125252)

We don't want to allow partial reductions resulting in a vscale x 1 type
as we can't lower it in the backend.

(cherry picked from commit c7995a6905f2320f280013454676f992a8c6f89f)
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  19 +-
 .../AArch64/partial-reduce-dot-product.ll     | 267 ++++++++++++------
 2 files changed, 198 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index aae2fdaf5bec3..4af3c482e6598 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4682,13 +4682,24 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   EVT InputEVT = EVT::getEVT(InputTypeA);
   EVT AccumEVT = EVT::getEVT(AccumType);
 
-  if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
-    return Invalid;
+  unsigned VFMinValue = VF.getKnownMinValue();
+
+  if (VF.isScalable()) {
+    if (!ST->isSVEorStreamingSVEAvailable())
+      return Invalid;
+
+    // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
+    // since we can't lower that type.
+    unsigned Scale =
+        AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
+    if (VFMinValue == Scale)
+      return Invalid;
+  }
   if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
     return Invalid;
 
   if (InputEVT == MVT::i8) {
-    switch (VF.getKnownMinValue()) {
+    switch (VFMinValue) {
     default:
       return Invalid;
     case 8:
@@ -4707,7 +4718,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   } else if (InputEVT == MVT::i16) {
     // FIXME: Allow i32 accumulator but increase cost, as we would extend
     //        it to i64.
-    if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
+    if (VFMinValue != 8 || AccumEVT != MVT::i64)
       return Invalid;
   } else
     return Invalid;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a0214ae88c2d6..9d0d30abce6c9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2434,7 +2434,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
@@ -2446,12 +2446,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
 ; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
-; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT:    [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
-; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP19]])
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
@@ -3106,89 +3106,188 @@ exit:                                 ; preds = %for.cond.cleanup.loopexit, %ent
   ret i64 %result
 }
 
+define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
+; CHECK-INTERLEAVE1-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVE1:       for.body.preheader:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]])
+; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVED:       for.body.preheader:
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define dso_local i32 @not_dotp_vscale1(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-MAXBW:       for.body.preheader:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]]
+; CHECK-MAXBW-NEXT:    [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-MAXBW:       middle.block:
+; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
+; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %for.body, label %exit
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %accum = phi i64 [ %add, %for.body ], [ %cost, %entry ]
+  %a.ptr = phi ptr [ %a.gep, %for.body ], [ %a, %entry ]
+  %b.ptr = phi ptr [ %b.gep, %for.body ], [ %b, %entry ]
+  %a.load = load i8, ptr %a.ptr, align 1
+  %a.ext = zext i8 %a.load to i64
+  %b.load = load i8, ptr %b.ptr, align 1
+  %b.ext = zext i8 %b.load to i64
+  %mul = mul nuw nsw i64 %b.ext, %a.ext
+  %add = add nsw i64 %mul, %accum
+  %a.gep = getelementptr inbounds nuw i8, ptr %a.ptr, i64 1
+  %b.gep = getelementptr inbounds nuw i8, ptr %b.ptr, i64 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %cmp.2 = icmp eq i32 %iv.next, %n
+  br i1 %cmp.2, label %exit, label %for.body
+
+exit:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %cost.result = phi i64 [ %cost, %entry ], [ %add, %for.body ]
+  %result = trunc i64 %cost.result to i32
+  ret i32 %result
+}
+
 !7 = distinct !{!7, !8, !9, !10}
 !8 = !{!"llvm.loop.mustprogress"}
 !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 !10 = !{!"llvm.loop.vectorize.enable", i1 true}
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
-;.
-; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-;.
-; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-; CHECK-INTERLEAVED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
-; CHECK-INTERLEAVED: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]}
-;.
-; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"}
-; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
-; CHECK-MAXBW: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
-; CHECK-MAXBW: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-;.

From 091c11f29e1665a950e74597b66add451d281b97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sun, 2 Feb 2025 16:55:22 +0100
Subject: [PATCH 024/282] [offload] `gnu::format` with variadic template
 functions is Clang-only (#124406)

Use `gnu::format` attribute only when compiling with Clang, as using it
against variadic template functions is a Clang extension and is not
supported by GCC.

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77958

Fixes #119069

(cherry picked from commit 359a9131704277bce0f806de31ac887e68a66902)
---
 .../common/include/ErrorReporting.h            | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
index 8478977a8f86a..2ad0f2b7dd6c6 100644
--- a/offload/plugins-nextgen/common/include/ErrorReporting.h
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -80,8 +80,10 @@ class ErrorReporter {
   /// Print \p Format, instantiated with \p Args to stderr.
   /// TODO: Allow redirection into a file stream.
   template <typename... ArgsTy>
-  [[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
-                                                      ArgsTy &&...Args) {
+#ifdef __clang__ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77958
+  [[gnu::format(__printf__, 1, 2)]]
+#endif
+  static void print(const char *Format, ArgsTy &&...Args) {
     raw_fd_ostream OS(STDERR_FILENO, false);
     OS << llvm::format(Format, Args...);
   }
@@ -89,8 +91,10 @@ class ErrorReporter {
   /// Print \p Format, instantiated with \p Args to stderr, but colored.
   /// TODO: Allow redirection into a file stream.
   template <typename... ArgsTy>
-  [[gnu::format(__printf__, 2, 3)]] static void
-  print(ColorTy Color, const char *Format, ArgsTy &&...Args) {
+#ifdef __clang__ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77958
+  [[gnu::format(__printf__, 2, 3)]]
+#endif
+  static void print(ColorTy Color, const char *Format, ArgsTy &&...Args) {
     raw_fd_ostream OS(STDERR_FILENO, false);
     WithColor(OS, HighlightColor(Color)) << llvm::format(Format, Args...);
   }
@@ -99,8 +103,10 @@ class ErrorReporter {
   /// a banner.
   /// TODO: Allow redirection into a file stream.
   template <typename... ArgsTy>
-  [[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
-                                                            ArgsTy &&...Args) {
+#ifdef __clang__ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77958
+  [[gnu::format(__printf__, 1, 2)]]
+#endif
+  static void reportError(const char *Format, ArgsTy &&...Args) {
     print(BoldRed, "%s", ErrorBanner);
     print(BoldRed, Format, Args...);
     print("\n");

From 28e563c23815d94de3de6c3fabc5aa5ad1b554b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sun, 2 Feb 2025 16:55:39 +0100
Subject: [PATCH 025/282] [offload] [test] Use test compiler ID rather than
 host (#124408)

Use the test compiler ID to verify whether tests can be run rather than
the host compiler. This makes it possible to run tests (with Clang)
while the library itself was built with GCC.

(cherry picked from commit 689ef5fda0ab07dfc452cb16d3646d53e612cb75)
---
 offload/test/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt
index 8a827e0a625ef..4768d9ccf223b 100644
--- a/offload/test/CMakeLists.txt
+++ b/offload/test/CMakeLists.txt
@@ -1,6 +1,6 @@
 # CMakeLists.txt file for unit testing OpenMP offloading runtime library.
-if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR
-   CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0.0)
+if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR
+   OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0)
   message(STATUS "Can only test with Clang compiler in version 6.0.0 or later.")
   message(WARNING "The check-offload target will not be available!")
   return()

From 5a406bde97d8ef9489669fa33c6540c26e4436f6 Mon Sep 17 00:00:00 2001
From: Ben Langmuir <blangmuir@apple.com>
Date: Wed, 29 Jan 2025 16:07:15 -0800
Subject: [PATCH 026/282] [asan][test] Attempt to fix
 suppressions-alloc-dealloc-mismatch.cpp on Darwin (#124987)

Add %env_asan_opts=alloc_dealloc_mismatch=1 since it is disabled by
default.

rdar://143830493
(cherry picked from commit f0d05b099dafda89df4c971b64b2051c33db5da1)
---
 .../asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp b/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
index fe88a5d0c9bf1..df6df6aa95471 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
@@ -1,10 +1,10 @@
 // Check that without suppressions, we catch the issue.
 // RUN: %clangxx_asan -O0 %s -o %t
-// RUN: not %run %t 2>&1 | FileCheck --check-prefix=CHECK-CRASH %s
+// RUN: %env_asan_opts=alloc_dealloc_mismatch=1 not %run %t 2>&1 | FileCheck --check-prefix=CHECK-CRASH %s
 
 // RUN: echo "alloc_dealloc_mismatch:function" > %t.supp
-// RUN: %clangxx_asan -O0 %s -o %t && %env_asan_opts=suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
-// RUN: %clangxx_asan -O3 %s -o %t && %env_asan_opts=suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
+// RUN: %clangxx_asan -O0 %s -o %t && %env_asan_opts=alloc_dealloc_mismatch=1:suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
+// RUN: %clangxx_asan -O3 %s -o %t && %env_asan_opts=alloc_dealloc_mismatch=1:suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 
 #include <stdio.h>
 #include <stdlib.h>

From 2d1f7e0082b80d9041f3772eb42dbd9153f7e6a6 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 29 Jan 2025 18:23:08 -0800
Subject: [PATCH 027/282] [asan][android] XFAIL
 suppressions-alloc-dealloc-mismatch

Android is missing suppression file on device.

Follow up to #124197.

(cherry picked from commit 751ae26b959c931efb6db84a27bf2a0444120094)
---
 .../asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp     | 3 +++
 .../asan/TestCases/suppressions-exec-relative-location.cpp     | 1 +
 compiler-rt/test/asan/TestCases/suppressions-function.cpp      | 1 +
 compiler-rt/test/asan/TestCases/suppressions-interceptor.cpp   | 1 +
 compiler-rt/test/asan/TestCases/suppressions-library.cpp       | 1 +
 5 files changed, 7 insertions(+)

diff --git a/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp b/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
index df6df6aa95471..43478ec2f3457 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
@@ -6,6 +6,9 @@
 // RUN: %clangxx_asan -O0 %s -o %t && %env_asan_opts=alloc_dealloc_mismatch=1:suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 // RUN: %clangxx_asan -O3 %s -o %t && %env_asan_opts=alloc_dealloc_mismatch=1:suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 
+// FIXME: Upload suppressions to device.
+// XFAIL: android
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/compiler-rt/test/asan/TestCases/suppressions-exec-relative-location.cpp b/compiler-rt/test/asan/TestCases/suppressions-exec-relative-location.cpp
index 58649c74fc4f9..0a028e2ebed6d 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-exec-relative-location.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-exec-relative-location.cpp
@@ -23,6 +23,7 @@
 // RUN: %env_asan_opts=suppressions='"folder/only/"' not %run %t 2>&1 | \
 // RUN:   FileCheck --check-prefix=CHECK-WRONG-FILE-NAME %s
 
+// FIXME: Upload suppressions to device.
 // XFAIL: android
 // UNSUPPORTED: ios
 
diff --git a/compiler-rt/test/asan/TestCases/suppressions-function.cpp b/compiler-rt/test/asan/TestCases/suppressions-function.cpp
index 9a0ef95ebb2ac..22d3dae1f0ef0 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-function.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-function.cpp
@@ -6,6 +6,7 @@
 // RUN: %clangxx_asan -O0 %s -o %t && %env_asan_opts=suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 // RUN: %clangxx_asan -O3 %s -o %t && %env_asan_opts=suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 
+// FIXME: Upload suppressions to device.
 // XFAIL: android
 // UNSUPPORTED: ios
 
diff --git a/compiler-rt/test/asan/TestCases/suppressions-interceptor.cpp b/compiler-rt/test/asan/TestCases/suppressions-interceptor.cpp
index e44ccb8e6527d..02995cf6f0510 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-interceptor.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-interceptor.cpp
@@ -5,6 +5,7 @@
 // RUN: echo "interceptor_name:strlen" > %t.supp
 // RUN: %env_asan_opts=suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 
+// FIXME: Upload suppressions to device.
 // XFAIL: android
 
 #include <stdio.h>
diff --git a/compiler-rt/test/asan/TestCases/suppressions-library.cpp b/compiler-rt/test/asan/TestCases/suppressions-library.cpp
index d11802a20ad3f..5427122eaa92f 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-library.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-library.cpp
@@ -9,6 +9,7 @@
 // RUN: echo "interceptor_via_lib:"%xdynamiclib_filename > %t.supp
 // RUN: %env_asan_opts=suppressions='"%t.supp"' %run %t 2>&1 | FileCheck --check-prefix=CHECK-IGNORE %s
 
+// FIXME: Upload suppressions to device.
 // XFAIL: android
 
 #include <stdio.h>

From 2b09e5936aa3f1f6f5a96639b1e37dab7f16ea1b Mon Sep 17 00:00:00 2001
From: Ben Langmuir <blangmuir@apple.com>
Date: Thu, 30 Jan 2025 09:18:10 -0800
Subject: [PATCH 028/282] [asan][test] Disable
 suppressions-alloc-dealloc-mismatch.cpp on Darwin

The suppressions mechanism doesn't work reliably in optimized builds,
which turns out to be a known issue (see b87543c704724 / svn r308908).
Disable this test, as it is also testing a feature (alloc/dealloc
mismatch) that is disabled by default on Darwin anyway.

rdar://143830493
(cherry picked from commit 4985804c0608a83f6ab017137c3d3d4f02827774)
---
 .../asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp    | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp b/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
index 43478ec2f3457..6ab796b1c76a6 100644
--- a/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
+++ b/compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp
@@ -9,6 +9,10 @@
 // FIXME: Upload suppressions to device.
 // XFAIL: android
 
+// FIXME: atos does not work for inlined functions, yet llvm-symbolizer
+// does not always work with debug info on Darwin.
+// UNSUPPORTED: darwin
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

From de1e3e98748a0b374f2d3e97cd3e423bdcb6400d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 3 Feb 2025 17:01:02 +0000
Subject: [PATCH 029/282] [VPlan] Only use SCEV for live-ins in tryToWiden.
 (#125436)

Replacing a recipe with a live-in may not be correct in all cases,
e.g. when replacing recipes involving header-phi recipes, like
reductions.

For now, only use SCEV to simplify live-ins.

More powerful input simplification can be built in top of
https://github.com/llvm/llvm-project/pull/124432 in the future.

Fixes https://github.com/llvm/llvm-project/issues/119173.
Fixes https://github.com/llvm/llvm-project/issues/125374.

PR: https://github.com/llvm/llvm-project/pull/125436
(cherry picked from commit 30f3752e54fa7cd595a434a985efbe9a7abe9b65)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  8 +++-
 .../AArch64/mul-simplification.ll             | 41 +++++++------------
 2 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 57b7358049bce..c4b159117e2e8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6615,8 +6615,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // fold away.  We can generalize this for all operations using the notion
     // of neutral elements.  (TODO)
     if (I->getOpcode() == Instruction::Mul &&
-        (PSE.getSCEV(I->getOperand(0))->isOne() ||
-         PSE.getSCEV(I->getOperand(1))->isOne()))
+        ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
+          PSE.getSCEV(I->getOperand(0))->isOne()) ||
+         (TheLoop->isLoopInvariant(I->getOperand(1)) &&
+          PSE.getSCEV(I->getOperand(1))->isOne())))
       return 0;
 
     // Detect reduction patterns
@@ -8566,6 +8568,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
       // to replace operands with constants.
       ScalarEvolution &SE = *PSE.getSE();
       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
+        if (!Op->isLiveIn())
+          return Op;
         Value *V = Op->getUnderlyingValue();
         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
           return Op;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
index 6b55f5291efd8..a8d44421a3c37 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
@@ -7,24 +7,10 @@ target triple = "arm64-apple-macosx"
 define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-LABEL: define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ <i64 12, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_PHI]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[VEC_PHI]])
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ 12, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IV]], 1
 ; CHECK-NEXT:    [[CMP1_I:%.*]] = icmp eq i32 [[TMP1]], 0
 ; CHECK-NEXT:    [[NARROW_I:%.*]] = select i1 [[CMP1_I]], i32 1, i32 [[IV]]
@@ -32,9 +18,9 @@ define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-NEXT:    [[RED_NEXT]] = mul nsw i64 [[RED]], [[MUL]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
 entry:
@@ -65,17 +51,20 @@ define i32 @add_reduction_select_operand_constant_but_non_uniform() {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 42, i32 0, i32 0, i32 0>, %[[VECTOR_PH]] ], [ splat (i32 42), %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ splat (i32 42), %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 42, i32 0, i32 0, i32 0>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2]] = add <4 x i32> zeroinitializer, [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1]] = add <4 x i32> zeroinitializer, [[VEC_PHI1]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 64
-; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 84))
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 42, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 42, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD2_REASS:%.*]], %[[LOOP]] ]
@@ -83,9 +72,9 @@ define i32 @add_reduction_select_operand_constant_but_non_uniform() {
 ; CHECK-NEXT:    [[ADD2_REASS]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[RDX_NEXT]] = add i32 0, [[RDX]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD2_REASS]], 64
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 entry:
@@ -109,6 +98,4 @@ exit:
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
 ;.

From 412360878db74f2f5ce8ef27348afd941de06a59 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 4 Feb 2025 14:43:22 -0800
Subject: [PATCH 030/282] workflows/release-binaries: Enable PGO (#124442)

Co-authored-by: Carlo Cabrera <github@carlo.cab>
(cherry picked from commit 0572580dd040a81dc69b798e202550d51d17204a)
---
 .github/workflows/release-binaries.yml | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index c49939ea48c5f..204ee6405382f 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -58,7 +58,6 @@ jobs:
       target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }}
       ccache: ${{ steps.vars.outputs.ccache }}
       build-flang: ${{ steps.vars.outputs.build-flang }}
-      enable-pgo: ${{ steps.vars.outputs.enable-pgo }}
       release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }}
       release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }}
       build-runs-on: ${{ steps.vars.outputs.build-runs-on }}
@@ -130,9 +129,6 @@ jobs:
           echo ccache=sccache >> $GITHUB_OUTPUT
         fi
 
-        # Detect necessary CMake flags
-        echo "enable-pgo=false" >> $GITHUB_OUTPUT
-        target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF"
         # The macOS builds try to cross compile some libraries so we need to
         # add extra CMake args to disable them.
         # See https://github.com/llvm/llvm-project/issues/99767
@@ -238,13 +234,14 @@ jobs:
             ${{ needs.prepare.outputs.target-cmake-flags }} \
             -C clang/cmake/caches/Release.cmake \
             -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \
-            -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}"
+            -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}"
 
     - name: Build
       shell: bash
       run: |
         ninja -v -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-package
-        mv ${{ steps.setup-stage.outputs.build-prefix  }}/build/tools/clang/stage2-bins/${{ needs.prepare.outputs.release-binary-filename }} .
+        release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'`
+        mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} .
     
     - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
       with:
@@ -259,7 +256,7 @@ jobs:
       shell: bash
       run: |
         find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname ${{ needs.prepare.outputs.release-binary-filename }} -delete
-        rm -Rf ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/_CPack_Packages
+        find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname _CPack_Packages -prune -exec rm -r {} +
     
     - name: Save Stage
       uses: ./workflows-main/.github/workflows/release-binaries-save-stage

From 8fe8a869201b71a80d4a1dac8fd751316f54b64f Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Mon, 3 Feb 2025 10:03:59 -0500
Subject: [PATCH 031/282] [flang][runtime] Make sure to link libexecinfo if it
 exists (#125344)

Fixes building the backtrace support on FreeBSD/NetBSD/OpenBSD/DragonFly and musl
libc with libexecinfo.

(cherry picked from commit cb2598dda1aae5096a77bc8a9f6679ca1b350e5e)
---
 flang/runtime/CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index fbfaae9a88064..bf27a121e4d17 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -59,10 +59,15 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     )
 endif()
 
+set(linked_libraries FortranDecimal)
+
 # function checks
 find_package(Backtrace)
 set(HAVE_BACKTRACE ${Backtrace_FOUND})
 set(BACKTRACE_HEADER ${Backtrace_HEADER})
+if(HAVE_BACKTRACE)
+  list(APPEND linked_libraries ${Backtrace_LIBRARY})
+endif()
 
 include(CheckCXXSymbolExists)
 include(CheckCXXSourceCompiles)
@@ -271,7 +276,7 @@ if (NOT DEFINED MSVC)
   add_flang_library(FortranRuntime
     ${sources}
     LINK_LIBS
-    FortranDecimal
+    ${linked_libraries}
 
     INSTALL_WITH_TOOLCHAIN
   )
@@ -279,7 +284,7 @@ else()
   add_flang_library(FortranRuntime
     ${sources}
     LINK_LIBS
-    FortranDecimal
+    ${linked_libraries}
   )
   set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
   add_flang_library(FortranRuntime.static ${sources}

From d8cec6dfe6abee77a802b48588bec8747777eaea Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Mon, 3 Feb 2025 09:37:16 +0000
Subject: [PATCH 032/282] [clang] Support member function poiners in
 Decl::getFunctionType() (#125077)

This seems consistent with the documentation, which claims it:

```
/// Looks through the Decl's underlying type to extract a FunctionType
/// when possible. Will return null if the type underlying the Decl does not
/// have a FunctionType.
const FunctionType *getFunctionType(bool BlocksToo = true) const;
```
Note: This patch rewords this doc comment to clarify it includes various
function pointer types.

Without this, attaching attributes (which use `HasFunctionProto`) to
member function pointers errors with:

```
error: '<attr>' only applies to non-K&R-style functions
```

...which does not really make sense, since member functions are not K&C
functions.

With this change the Arm SME TypeAttrs work correctly on member function
pointers.

Note, however, that not all attributes work correctly when applied to
function pointers or member function pointers. For example,
`alloc_align` crashes when applied to a function pointer (on truck):
https://godbolt.org/z/YvMhnhKfx (as it only expects a `FunctionDecl` not
a `ParmVarDecl`). The same crash applies to member function pointers
(for the same reason).

(cherry picked from commit 692c9b210728323ac499a402ee6eb901f35856f2)
---
 clang/include/clang/AST/DeclBase.h            |  7 +++-
 clang/include/clang/Basic/Attr.td             |  2 +-
 clang/lib/AST/DeclBase.cpp                    |  2 +
 clang/test/AST/attr-print-emit.cpp            |  5 +++
 ...sme-attributes-member-function-pointer.cpp | 37 +++++++++++++++++++
 .../CodeGen/xfail-alloc-align-fn-pointers.cpp | 10 +++++
 6 files changed, 60 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp
 create mode 100644 clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp

diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 2c0c3a8dc2f9d..3bb82c1572ef9 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -1257,8 +1257,11 @@ class alignas(8) Decl {
   int64_t getID() const;
 
   /// Looks through the Decl's underlying type to extract a FunctionType
-  /// when possible. Will return null if the type underlying the Decl does not
-  /// have a FunctionType.
+  /// when possible. This includes direct FunctionDecls, along with various
+  /// function types and typedefs. This includes function pointers/references,
+  /// member function pointers, and optionally if \p BlocksToo is set
+  /// Objective-C block pointers. Returns nullptr if the type underlying the
+  /// Decl does not have a FunctionType.
   const FunctionType *getFunctionType(bool BlocksToo = true) const;
 
   // Looks through the Decl's underlying type to determine if it's a
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index f4ba2bc3c6de3..2a3a29bd2ee1c 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -198,7 +198,7 @@ def OpenCLKernelFunction
 // inclusive nature of subject testing).
 def HasFunctionProto : SubsetSubject<DeclBase,
                                      [{(S->getFunctionType(true) != nullptr &&
-                              isa<FunctionProtoType>(S->getFunctionType())) ||
+                                       isa<FunctionProtoType>(S->getFunctionType())) ||
                                        isa<ObjCMethodDecl>(S) ||
                                        isa<BlockDecl>(S)}],
                                      "non-K&R-style functions">;
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 8506b95f761fe..adf6053392db3 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1203,6 +1203,8 @@ const FunctionType *Decl::getFunctionType(bool BlocksToo) const {
 
   if (Ty->isFunctionPointerType())
     Ty = Ty->castAs<PointerType>()->getPointeeType();
+  else if (Ty->isMemberFunctionPointerType())
+    Ty = Ty->castAs<MemberPointerType>()->getPointeeType();
   else if (Ty->isFunctionReferenceType())
     Ty = Ty->castAs<ReferenceType>()->getPointeeType();
   else if (BlocksToo && Ty->isBlockPointerType())
diff --git a/clang/test/AST/attr-print-emit.cpp b/clang/test/AST/attr-print-emit.cpp
index a9bca6778d0f1..77826f8f9af09 100644
--- a/clang/test/AST/attr-print-emit.cpp
+++ b/clang/test/AST/attr-print-emit.cpp
@@ -91,3 +91,8 @@ ANNOTATE_ATTR NONNULL_ATTR void fn_non_null_annotated_attr(int *) __attribute__(
 
 [[gnu::nonnull(1)]] [[gnu::always_inline]] void cxx11_attr(int*) ANNOTATE_ATTR;
 // CHECK: {{\[\[}}gnu::nonnull(1)]] {{\[\[}}gnu::always_inline]] void cxx11_attr(int *) __attribute__((annotate("Annotated")));
+
+struct Foo;
+
+// CHECK: void as_member_fn_ptr(int *(Foo::*member)(int) __attribute__((alloc_size(1))));
+void as_member_fn_ptr(int* (Foo::*member)(int)  __attribute__((alloc_size(1))));
diff --git a/clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp b/clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp
new file mode 100644
index 0000000000000..ee784c816a060
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -x c++ -std=c++20  -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK
+
+struct TestStruct;
+
+__arm_new("za", "zt0") void test(TestStruct& TS,
+  void (TestStruct::*streaming_member_ptr)() __arm_streaming,
+  void (TestStruct::*streaming_compat_member)() __arm_streaming_compatible,
+  void (TestStruct::*arm_in_member)() __arm_in("za", "zt0"),
+  void (TestStruct::*arm_inout_member)() __arm_inout("za", "zt0"),
+  void (TestStruct::*arm_preserves_member)() __arm_preserves("za", "zt0"),
+  void (TestStruct::*arm_agnostic_member)() __arm_agnostic("sme_za_state")) {
+
+  // CHECK: call void %{{.*}} [[STREAMING_MEMBER_CALL_ATTRS:#.+]]
+  (TS.*streaming_member_ptr)();
+
+  // CHECK: call void %{{.*}} [[STREAMING_COMPAT_MEMBER_CALL_ATTRS:#.+]]
+  (TS.*streaming_compat_member)();
+
+  // CHECK: call void %{{.*}} [[ARM_IN_MEMBER_CALL_ATTRS:#.+]]
+  (TS.*arm_in_member)();
+
+  // CHECK: call void %{{.*}} [[ARM_INOUT_MEMBER_CALL_ATTRS:#.+]]
+  (TS.*arm_inout_member)();
+
+  // CHECK: call void %{{.*}} [[ARM_PRESERVES_MEMBER_CALL_ATTRS:#.+]]
+  (TS.*arm_preserves_member)();
+
+  // CHECK: call void %{{.*}} [[ARM_AGNOSTIC_MEMBER_CALL_ATTRS:#.+]]
+  (TS.*arm_agnostic_member)();
+}
+
+// CHECK: attributes [[STREAMING_MEMBER_CALL_ATTRS]] = { "aarch64_pstate_sm_enabled" }
+// CHECK: attributes [[STREAMING_COMPAT_MEMBER_CALL_ATTRS]] = { "aarch64_pstate_sm_compatible" }
+// CHECK: attributes [[ARM_IN_MEMBER_CALL_ATTRS]] = { "aarch64_in_za" "aarch64_in_zt0" }
+// CHECK: attributes [[ARM_INOUT_MEMBER_CALL_ATTRS]] = { "aarch64_inout_za" "aarch64_inout_zt0" }
+// CHECK: attributes [[ARM_PRESERVES_MEMBER_CALL_ATTRS]] = { "aarch64_preserves_za" "aarch64_preserves_zt0" }
+// CHECK: attributes [[ARM_AGNOSTIC_MEMBER_CALL_ATTRS]] = { "aarch64_za_state_agnostic" }
diff --git a/clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp b/clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp
new file mode 100644
index 0000000000000..80067500284b1
--- /dev/null
+++ b/clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp
@@ -0,0 +1,10 @@
+
+// RUN: %clang_cc1 %s
+
+// FIXME: These should not crash!
+// XFAIL: *
+
+void aa_fn_ptr(char* (*member)(char*)  __attribute__((alloc_align(1))));
+
+struct Test;
+void aa_member_fn_ptr(char* (Test::*member)(char*)  __attribute__((alloc_align(1))));

From 78729e5ae25e92354654c70cb9d91c3cb7e05bba Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas@arm.com>
Date: Thu, 6 Feb 2025 01:27:35 +0000
Subject: [PATCH 033/282] [FMV][AArch64] Release notes for LLVM20. (#125525)

---
 clang/docs/ReleaseNotes.rst |  7 +++++++
 llvm/docs/ReleaseNotes.md   | 14 ++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 53534d821b2c9..b23963c8e611a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -654,6 +654,10 @@ Attribute Changes in Clang
 
 - The ``target_version`` attribute is now only supported for AArch64 and RISC-V architectures.
 
+- When targeting AArch64, a function declaration annotated with ``target_version("default")``
+  now generates a mangled default version of the function, whereas before at least one more
+  version other than the default was required to trigger Function Multi Versioning.
+
 - Clang now permits the usage of the placement new operator in ``[[msvc::constexpr]]``
   context outside of the std namespace. (#GH74924)
 
@@ -1188,6 +1192,9 @@ Arm and AArch64 Support
 
   * FUJITSU-MONAKA (fujitsu-monaka)
 
+- Runtime detection of depended-on Function Multi Versioning features has been added
+  in accordance with the Arm C Language Extensions (ACLE).
+
 Android Support
 ^^^^^^^^^^^^^^^
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index e0acb8f48c5b9..db9a681ebe2bc 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -130,6 +130,10 @@ Changes to building LLVM
 Changes to TableGen
 -------------------
 
+* The ARMTargetDefEmitter now binds Funtion Multi Versioning features to the
+  corresponding AArch64 Architecture Extensions such that their dependencies
+  can be autogenerated using TableGen.
+
 Changes to Interprocedural Optimizations
 ----------------------------------------
 
@@ -431,9 +435,19 @@ Changes to the C API
 Changes to the CodeGen infrastructure
 -------------------------------------
 
+* GlobalOpt can now statically resolve calls to multi-versioned functions when targeting AArch64.
+  These calls would otherwise be routed through an IFunc resolver function. This optimization
+  can be applied when the caller is either a multi-versioned function itself, or it is compiled
+  with a sufficiently high set of architecture features (including the `target` attribute, and
+  command line options).
+
 Changes to the Metadata Info
 ---------------------------------
 
+* Multi-versioned functions targeting AArch64 are annotated with new metadata named `fmv-features`.
+  The metadata string value consists of a comma-separated list of Function Multi Versioning feature
+  names as defined in the Arm C Language Extensions (ACLE).
+
 Changes to the Debug Info
 ---------------------------------
 

From 49a8fb2bef6d2321554b3f04ab0a768ed3900390 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 5 Feb 2025 19:42:06 -0800
Subject: [PATCH 034/282] workflows/premerge: Re-enable tests (#125978)

These were accidentally disabled in
0cca13f758a8bda75eab45ad4bf896bb83921ec9.
---
 .github/workflows/premerge.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 956760feaa3b5..45e6bb763a0ef 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -14,8 +14,6 @@ on:
       # do this is that it allows us to take advantage of concurrency groups
       # to cancel in progress CI jobs whenever the PR is closed.
       - closed
-    paths:
-      - .github/workflows/premerge.yaml
   push:
     branches:
       - 'main'

From c08c9f97f93b29842fbfd80791088abf42c753ca Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 5 Feb 2025 14:49:01 -0800
Subject: [PATCH 035/282] [RISCV] Use getSignedConstant for negative values.
 (#125903)

The APInt constructor asserts if bits are set past the size of the APInt
unless it is signed. This currently fails on RV32 because more than XLen
bits are set.

(cherry picked from commit 0d7ee520d3a9b8997adf8eaaa22b33db9659d94e)
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp             | 4 ++--
 llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll  | 1 +
 llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8d09e534b1858..8b5ee3e67ce63 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16297,7 +16297,7 @@ static SDValue performVP_REVERSECombine(SDNode *N, SelectionDAG &DAG,
   SDValue Temp2 = DAG.getNode(ISD::MUL, DL, XLenVT, Temp1,
                               DAG.getConstant(ElemWidthByte, DL, XLenVT));
   SDValue Base = DAG.getNode(ISD::ADD, DL, XLenVT, VPLoad->getBasePtr(), Temp2);
-  SDValue Stride = DAG.getConstant(-ElemWidthByte, DL, XLenVT);
+  SDValue Stride = DAG.getSignedConstant(-ElemWidthByte, DL, XLenVT);
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachinePointerInfo PtrInfo(VPLoad->getAddressSpace());
@@ -16358,7 +16358,7 @@ static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG,
                               DAG.getConstant(ElemWidthByte, DL, XLenVT));
   SDValue Base =
       DAG.getNode(ISD::ADD, DL, XLenVT, VPStore->getBasePtr(), Temp2);
-  SDValue Stride = DAG.getConstant(-ElemWidthByte, DL, XLenVT);
+  SDValue Stride = DAG.getSignedConstant(-ElemWidthByte, DL, XLenVT);
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachinePointerInfo PtrInfo(VPStore->getAddressSpace());
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll b/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll
index 50e26bd141070..24d8e56fa17fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=riscv64 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 2 x float> @test_reverse_load_combiner(<vscale x 2 x float>* %ptr, i32 zeroext %evl) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll
index 4896a1367935a..a2466c48b0ab7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=riscv64 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s
 
 define void @test_store_reverse_combiner(<vscale x 2 x float> %val, <vscale x 2 x float>* %ptr, i32 zeroext %evl) {

From 99f3ed737567acfccd9ec196aaf8595447ac1d32 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Fri, 7 Feb 2025 08:56:13 +0800
Subject: [PATCH 036/282] [X86][AVX10] Disable m[no-]avx10.1 and switch
 m[no-]avx10.2 to alias of 512 bit options (#124511) (#125057)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per the feedback we got, we’d like to switch m[no-]avx10.2 to alias of
512 bit options and disable m[no-]avx10.1 due to they were alias of 256
bit options.

We also change -mno-avx10.[1,2]-512 to alias of 256 bit options to
disable both 256 and 512 instructions.

Cherry-pick from
https://github.com/llvm/llvm-project/commit/9ebfee9d686b41f789b47a6acc7dcdba808fb3f9
---
 clang/docs/ReleaseNotes.rst                   |  4 ++++
 clang/include/clang/Driver/Options.td         | 12 +++++-------
 clang/lib/Driver/ToolChains/Arch/X86.cpp      | 13 ++++++++-----
 clang/test/Driver/x86-target-features.c       | 16 ++++++++++------
 clang/test/Preprocessor/x86_target_features.c |  3 +--
 5 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b23963c8e611a..3281ac0c4dbe2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1165,6 +1165,10 @@ X86 Support
 - Support ISA of ``MOVRS``.
 
 - Supported ``-march/tune=diamondrapids``
+- Disable ``-m[no-]avx10.1`` and switch ``-m[no-]avx10.2`` to alias of 512 bit
+  options.
+- Change ``-mno-avx10.1-512`` to alias of ``-mno-avx10.1-256`` to disable both
+  256 and 512 bit instructions.
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1af633e59d0bb..a2b47b943ef90 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6441,15 +6441,13 @@ def mno_avx : Flag<["-"], "mno-avx">, Group<m_x86_Features_Group>;
 def mavx10_1_256 : Flag<["-"], "mavx10.1-256">, Group<m_x86_AVX10_Features_Group>;
 def mno_avx10_1_256 : Flag<["-"], "mno-avx10.1-256">, Group<m_x86_AVX10_Features_Group>;
 def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group>;
-def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
-def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
-def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
+def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Alias<mno_avx10_1_256>;
+def mavx10_1 : Flag<["-"], "mavx10.1">, Flags<[Unsupported]>;
+def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Flags<[Unsupported]>;
 def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
-def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
 def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
-def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
-def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
-def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
+def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_512>;
+def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Group<m_x86_AVX10_Features_Group>;
 def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
 def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
 def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index b2109e11038fe..47c2c3e23f9fd 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -237,15 +237,18 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
 
     bool IsNegative = Name.consume_front("no-");
 
-#ifndef NDEBUG
-    assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
     StringRef Version, Width;
     std::tie(Version, Width) = Name.substr(6).split('-');
+    assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
     assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
-    assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
-#endif
 
-    Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name));
+    if (Width == "") {
+      assert(IsNegative && "Only negative options can omit width.");
+      Features.push_back(Args.MakeArgString("-" + Name + "-256"));
+    } else {
+      assert((Width == "256" || Width == "512") && "Invalid vector length.");
+      Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name));
+    }
   }
 
   // Now add any that the user explicitly requested on the command line,
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 339f593dc760a..18361251dcebc 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -395,7 +395,8 @@
 // EVEX512: "-target-feature" "+evex512"
 // NO-EVEX512: "-target-feature" "-evex512"
 
-// RUN: %clang --target=i386 -mavx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_256 %s
+// RUN: not %clang --target=i386 -march=i386 -mavx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=UNSUPPORT-AVX10 %s
+// RUN: not %clang --target=i386 -march=i386 -mno-avx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=UNSUPPORT-AVX10 %s
 // RUN: %clang --target=i386 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_256 %s
 // RUN: %clang --target=i386 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_512 %s
 // RUN: %clang --target=i386 -mavx10.1-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_512 %s
@@ -403,15 +404,18 @@
 // RUN: not %clang --target=i386 -march=i386 -mavx10.1-128 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s
 // RUN: not %clang --target=i386 -march=i386 -mavx10.a-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s
 // RUN: not %clang --target=i386 -march=i386 -mavx10.1024-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s
-// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mavx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s
-// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mno-avx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s
-// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s
-// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s
-// RUN: %clang --target=i386 -mavx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_256 %s
+// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mavx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s
+// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-avx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s
+// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s
+// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s
+// RUN: %clang --target=i386 -mavx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_512 %s
+// RUN: %clang --target=i386 -mno-avx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX10_2 %s
 // RUN: %clang --target=i386 -mavx10.2-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_256 %s
 // RUN: %clang --target=i386 -mavx10.2-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_512 %s
 // RUN: %clang --target=i386 -mavx10.2-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_256,AVX10_1_512 %s
 // RUN: %clang --target=i386 -mavx10.2-512 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_512,AVX10_1_256 %s
+// UNSUPPORT-AVX10: error: unsupported option '-m{{.*}}avx10.1' for target 'i386'
+// NO-AVX10_2: "-target-feature" "-avx10.2-256"
 // AVX10_2_256: "-target-feature" "+avx10.2-256"
 // AVX10_2_512: "-target-feature" "+avx10.2-512"
 // AVX10_1_256: "-target-feature" "+avx10.1-256"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index fa3d0038f05a9..63222a882ff53 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -742,10 +742,8 @@
 // AVXVNNIINT16NOAVX2-NOT: #define __AVX2__ 1
 // AVXVNNIINT16NOAVX2-NOT: #define __AVXVNNIINT16__ 1
 
-// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1 -x c -E -dM -o - %s | FileCheck  -check-prefix=AVX10_1_256 %s
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-256 -x c -E -dM -o - %s | FileCheck  -check-prefix=AVX10_1_256 %s
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-256 -mno-avx512f -x c -E -dM -o - %s | FileCheck  -check-prefix=AVX10_1_256 %s
-// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2 -x c -E -dM -o - %s | FileCheck  -check-prefixes=AVX10_1_256,AVX10_2_256 %s
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2-256 -x c -E -dM -o - %s | FileCheck  -check-prefixes=AVX10_1_256,AVX10_2_256 %s
 // AVX10_1_256-NOT: __AVX10_1_512__
 // AVX10_1_256: #define __AVX10_1__ 1
@@ -758,6 +756,7 @@
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-512 -x c -E -dM -o - %s | FileCheck  -check-prefix=AVX10_1_512 %s
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-512 -mno-avx512f -x c -E -dM -o - %s | FileCheck  -check-prefix=AVX10_1_512 %s
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-512 -mno-evex512 -x c -E -dM -o - %s | FileCheck  -check-prefix=AVX10_1_512 %s
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2 -x c -E -dM -o - %s | FileCheck  -check-prefixes=AVX10_1_512,AVX10_2_512 %s
 // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2-512 -x c -E -dM -o - %s | FileCheck  -check-prefixes=AVX10_1_512,AVX10_2_512 %s
 // AVX10_1_512: #define __AVX10_1_512__ 1
 // AVX10_1_512: #define __AVX10_1__ 1

From 857d8d767ab2f3c5f08f9710e98aa5fd1b1dff66 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sat, 1 Feb 2025 15:14:17 +0800
Subject: [PATCH 037/282] [InstCombine] Fix FMF propagation in
 `foldSelectWithFCmpToFabs` (#121580)

Consider the following pattern:
```
%cmp = fcmp <pred> double %x, 0.000000e+00
%negX = fneg <fmf> double %x
%sel = select i1 %cmp, double %x, double %negX
```
We cannot propagate ninf from fneg to select since `%negX` may not be
chosen. Similarly, we cannot propagate nnan unless `%negX` is guaranteed
to be selected when `%x` is NaN.
This patch also propagates nnan/ninf from fcmp to avoid regression in
`PhaseOrdering/generate-fabs.ll`.

Alive2: https://alive2.llvm.org/ce/z/t6U-tA
Closes https://github.com/llvm/llvm-project/issues/121430 and
https://github.com/llvm/llvm-project/issues/113989.

(cherry picked from commit 3ec6a6b85aed838b7d56bd6843cad52e822b9111)
---
 .../InstCombine/InstCombineSelect.cpp         | 11 ++++-
 llvm/test/Transforms/InstCombine/fabs.ll      | 40 +++++++++----------
 llvm/test/Transforms/InstCombine/fneg-fabs.ll | 17 ++++----
 3 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index e2af4d4c56364..29c5cef84ccdb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2852,10 +2852,10 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI,
     if (!match(TrueVal, m_FNeg(m_Specific(X))))
       return nullptr;
 
-    // Forward-propagate nnan and ninf from the fneg to the select.
+    // Forward-propagate nnan and ninf from the fcmp to the select.
     // If all inputs are not those values, then the select is not either.
     // Note: nsz is defined differently, so it may not be correct to propagate.
-    FastMathFlags FMF = cast<FPMathOperator>(TrueVal)->getFastMathFlags();
+    FastMathFlags FMF = cast<FPMathOperator>(CondVal)->getFastMathFlags();
     if (FMF.noNaNs() && !SI.hasNoNaNs()) {
       SI.setHasNoNaNs(true);
       ChangedFMF = true;
@@ -2864,6 +2864,13 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI,
       SI.setHasNoInfs(true);
       ChangedFMF = true;
     }
+    // Forward-propagate nnan from the fneg to the select.
+    // The nnan flag can be propagated iff fneg is selected when X is NaN.
+    if (!SI.hasNoNaNs() && cast<FPMathOperator>(TrueVal)->hasNoNaNs() &&
+        (Swap ? FCmpInst::isOrdered(Pred) : FCmpInst::isUnordered(Pred))) {
+      SI.setHasNoNaNs(true);
+      ChangedFMF = true;
+    }
 
     // With nsz, when 'Swap' is false:
     // fold (X < +/-0.0) ? -X : X or (X <= +/-0.0) ? -X : X to fabs(X)
diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll
index cccf0f4457b6a..7b9a672f188ca 100644
--- a/llvm/test/Transforms/InstCombine/fabs.ll
+++ b/llvm/test/Transforms/InstCombine/fabs.ll
@@ -484,7 +484,7 @@ define double @select_fcmp_nnan_nsz_olt_zero(double %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_olt_zero(
 ; CHECK-NEXT:    [[LTZERO:%.*]] = fcmp olt double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz double [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LTZERO]], double [[NEGX]], double [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LTZERO]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[FABS]]
 ;
   %ltzero = fcmp olt double %x, 0.0
@@ -523,7 +523,7 @@ define double @select_fcmp_nnan_nsz_olt_zero_unary_fneg(double %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_olt_zero_unary_fneg(
 ; CHECK-NEXT:    [[LTZERO:%.*]] = fcmp olt double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz double [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LTZERO]], double [[NEGX]], double [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LTZERO]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[FABS]]
 ;
   %ltzero = fcmp olt double %x, 0.0
@@ -553,7 +553,7 @@ define float @select_fcmp_nnan_nsz_olt_negzero(float %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_olt_negzero(
 ; CHECK-NEXT:    [[LTZERO:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan ninf nsz float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LTZERO]], float [[NEGX]], float [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LTZERO]], float [[NEGX]], float [[X]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %ltzero = fcmp olt float %x, -0.0
@@ -579,7 +579,7 @@ define float @select_fcmp_nnan_nsz_ult_negzero(float %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ult_negzero(
 ; CHECK-NEXT:    [[LTZERO:%.*]] = fcmp ult float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan ninf nsz float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LTZERO]], float [[NEGX]], float [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LTZERO]], float [[NEGX]], float [[X]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %ltzero = fcmp ult float %x, -0.0
@@ -592,7 +592,7 @@ define float @select_fcmp_nnan_nsz_olt_negzero_unary_fneg(float %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_olt_negzero_unary_fneg(
 ; CHECK-NEXT:    [[LTZERO:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan ninf nsz float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LTZERO]], float [[NEGX]], float [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LTZERO]], float [[NEGX]], float [[X]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %ltzero = fcmp olt float %x, -0.0
@@ -607,7 +607,7 @@ define float @select_fcmp_nnan_nsz_ult_negzero_unary_fneg(float %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ult_negzero_unary_fneg(
 ; CHECK-NEXT:    [[LTZERO:%.*]] = fcmp ult float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan ninf nsz float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LTZERO]], float [[NEGX]], float [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LTZERO]], float [[NEGX]], float [[X]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %ltzero = fcmp ult float %x, -0.0
@@ -622,7 +622,7 @@ define double @select_fcmp_nnan_nsz_ole_zero(double %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ole_zero(
 ; CHECK-NEXT:    [[LEZERO:%.*]] = fcmp ole double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast double [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LEZERO]], double [[NEGX]], double [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LEZERO]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[FABS]]
 ;
   %lezero = fcmp ole double %x, 0.0
@@ -648,7 +648,7 @@ define double @select_fcmp_nnan_nsz_ule_zero(double %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ule_zero(
 ; CHECK-NEXT:    [[LEZERO:%.*]] = fcmp ule double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast double [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LEZERO]], double [[NEGX]], double [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LEZERO]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[FABS]]
 ;
   %lezero = fcmp ule double %x, 0.0
@@ -661,7 +661,7 @@ define double @select_fcmp_nnan_nsz_ole_zero_unary_fneg(double %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ole_zero_unary_fneg(
 ; CHECK-NEXT:    [[LEZERO:%.*]] = fcmp ole double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast double [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LEZERO]], double [[NEGX]], double [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LEZERO]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[FABS]]
 ;
   %lezero = fcmp ole double %x, 0.0
@@ -676,7 +676,7 @@ define double @select_fcmp_nnan_nsz_ule_zero_unary_fneg(double %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ule_zero_unary_fneg(
 ; CHECK-NEXT:    [[LEZERO:%.*]] = fcmp ule double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast double [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[LEZERO]], double [[NEGX]], double [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LEZERO]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[FABS]]
 ;
   %lezero = fcmp ule double %x, 0.0
@@ -691,7 +691,7 @@ define float @select_fcmp_nnan_nsz_ole_negzero(float %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ole_negzero(
 ; CHECK-NEXT:    [[LEZERO:%.*]] = fcmp ole float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LEZERO]], float [[NEGX]], float [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LEZERO]], float [[NEGX]], float [[X]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %lezero = fcmp ole float %x, -0.0
@@ -730,7 +730,7 @@ define float @select_fcmp_nnan_nsz_ole_negzero_unary_fneg(float %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ole_negzero_unary_fneg(
 ; CHECK-NEXT:    [[LEZERO:%.*]] = fcmp ole float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[LEZERO]], float [[NEGX]], float [[X]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[LEZERO]], float [[NEGX]], float [[X]]
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %lezero = fcmp ole float %x, -0.0
@@ -802,7 +802,7 @@ define <2 x float> @select_fcmp_nnan_nsz_ugt_zero(<2 x float> %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ugt_zero(
 ; CHECK-NEXT:    [[GTZERO:%.*]] = fcmp ugt <2 x float> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz arcp <2 x float> [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan <2 x i1> [[GTZERO]], <2 x float> [[X]], <2 x float> [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select <2 x i1> [[GTZERO]], <2 x float> [[X]], <2 x float> [[NEGX]]
 ; CHECK-NEXT:    ret <2 x float> [[FABS]]
 ;
   %gtzero = fcmp ugt <2 x float> %x, zeroinitializer
@@ -830,7 +830,7 @@ define <2 x float> @select_fcmp_nnan_nsz_ugt_zero_unary_fneg(<2 x float> %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ugt_zero_unary_fneg(
 ; CHECK-NEXT:    [[GTZERO:%.*]] = fcmp ugt <2 x float> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz arcp <2 x float> [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan <2 x i1> [[GTZERO]], <2 x float> [[X]], <2 x float> [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select <2 x i1> [[GTZERO]], <2 x float> [[X]], <2 x float> [[NEGX]]
 ; CHECK-NEXT:    ret <2 x float> [[FABS]]
 ;
   %gtzero = fcmp ugt <2 x float> %x, zeroinitializer
@@ -845,7 +845,7 @@ define half @select_fcmp_nnan_nsz_ogt_negzero(half %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ogt_negzero(
 ; CHECK-NEXT:    [[GTZERO:%.*]] = fcmp ogt half [[X:%.*]], 0xH0000
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast half [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[GTZERO]], half [[X]], half [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[GTZERO]], half [[X]], half [[NEGX]]
 ; CHECK-NEXT:    ret half [[FABS]]
 ;
   %gtzero = fcmp ogt half %x, -0.0
@@ -860,7 +860,7 @@ define half @select_fcmp_nnan_nsz_ugt_negzero(half %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_ugt_negzero(
 ; CHECK-NEXT:    [[GTZERO:%.*]] = fcmp ugt half [[X:%.*]], 0xH0000
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast half [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan ninf i1 [[GTZERO]], half [[X]], half [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[GTZERO]], half [[X]], half [[NEGX]]
 ; CHECK-NEXT:    ret half [[FABS]]
 ;
   %gtzero = fcmp ugt half %x, -0.0
@@ -890,7 +890,7 @@ define <2 x double> @select_fcmp_nnan_nsz_uge_zero(<2 x double> %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_uge_zero(
 ; CHECK-NEXT:    [[GEZERO:%.*]] = fcmp uge <2 x double> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg reassoc nnan nsz <2 x double> [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan <2 x i1> [[GEZERO]], <2 x double> [[X]], <2 x double> [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select <2 x i1> [[GEZERO]], <2 x double> [[X]], <2 x double> [[NEGX]]
 ; CHECK-NEXT:    ret <2 x double> [[FABS]]
 ;
   %gezero = fcmp uge <2 x double> %x, zeroinitializer
@@ -918,7 +918,7 @@ define <2 x double> @select_fcmp_nnan_nsz_uge_zero_unary_fneg(<2 x double> %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_uge_zero_unary_fneg(
 ; CHECK-NEXT:    [[GEZERO:%.*]] = fcmp uge <2 x double> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg reassoc nnan nsz <2 x double> [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan <2 x i1> [[GEZERO]], <2 x double> [[X]], <2 x double> [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select <2 x i1> [[GEZERO]], <2 x double> [[X]], <2 x double> [[NEGX]]
 ; CHECK-NEXT:    ret <2 x double> [[FABS]]
 ;
   %gezero = fcmp uge <2 x double> %x, zeroinitializer
@@ -948,7 +948,7 @@ define half @select_fcmp_nnan_nsz_uge_negzero(half %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_uge_negzero(
 ; CHECK-NEXT:    [[GEZERO:%.*]] = fcmp uge half [[X:%.*]], 0xH0000
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz half [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[GEZERO]], half [[X]], half [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[GEZERO]], half [[X]], half [[NEGX]]
 ; CHECK-NEXT:    ret half [[FABS]]
 ;
   %gezero = fcmp uge half %x, -0.0
@@ -976,7 +976,7 @@ define half @select_fcmp_nnan_nsz_uge_negzero_unary_fneg(half %x) {
 ; CHECK-LABEL: @select_fcmp_nnan_nsz_uge_negzero_unary_fneg(
 ; CHECK-NEXT:    [[GEZERO:%.*]] = fcmp uge half [[X:%.*]], 0xH0000
 ; CHECK-NEXT:    [[NEGX:%.*]] = fneg nnan nsz half [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = select nnan i1 [[GEZERO]], half [[X]], half [[NEGX]]
+; CHECK-NEXT:    [[FABS:%.*]] = select i1 [[GEZERO]], half [[X]], half [[NEGX]]
 ; CHECK-NEXT:    ret half [[FABS]]
 ;
   %gezero = fcmp uge half %x, -0.0
diff --git a/llvm/test/Transforms/InstCombine/fneg-fabs.ll b/llvm/test/Transforms/InstCombine/fneg-fabs.ll
index fdcdfd123eefa..c013a2c9f5449 100644
--- a/llvm/test/Transforms/InstCombine/fneg-fabs.ll
+++ b/llvm/test/Transforms/InstCombine/fneg-fabs.ll
@@ -20,8 +20,8 @@ define double @select_noFMF_nfabs_lt(double %x) {
 ; One test where the neg has fmfs.
 define double @select_nsz_nfabs_lt_fmfProp(double %x) {
 ; CHECK-LABEL: @select_nsz_nfabs_lt_fmfProp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf nsz double @llvm.fabs.f64(double [[X:%.*]])
-; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan ninf nsz double [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan nsz double @llvm.fabs.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan nsz double [[TMP1]]
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
   %cmp = fcmp olt double %x, 0.000000e+00
@@ -32,8 +32,8 @@ define double @select_nsz_nfabs_lt_fmfProp(double %x) {
 
 define double @select_nsz_nnan_nfabs_lt_fmfProp(double %x) {
 ; CHECK-LABEL: @select_nsz_nnan_nfabs_lt_fmfProp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf nsz double @llvm.fabs.f64(double [[X:%.*]])
-; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan ninf nsz double [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan nsz double @llvm.fabs.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan nsz double [[TMP1]]
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
   %cmp = fcmp olt double %x, 0.000000e+00
@@ -147,8 +147,9 @@ define double @select_noFMF_nfabs_gt(double %x) {
 ; One test where the neg has fmfs.
 define double @select_nsz_nfabs_gt_fmfProp(double %x) {
 ; CHECK-LABEL: @select_nsz_nfabs_gt_fmfProp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf nsz double @llvm.fabs.f64(double [[X:%.*]])
-; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan ninf nsz double [[TMP1]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[NEGX:%.*]] = fneg fast double [[X]]
+; CHECK-NEXT:    [[SEL:%.*]] = select nsz i1 [[CMP]], double [[NEGX]], double [[X]]
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
   %cmp = fcmp ogt double %x, 0.000000e+00
@@ -159,8 +160,8 @@ define double @select_nsz_nfabs_gt_fmfProp(double %x) {
 
 define double @select_nsz_nnan_nfabs_gt_fmfProp(double %x) {
 ; CHECK-LABEL: @select_nsz_nnan_nfabs_gt_fmfProp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf nsz double @llvm.fabs.f64(double [[X:%.*]])
-; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan ninf nsz double [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan nsz double @llvm.fabs.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[SEL:%.*]] = fneg nnan nsz double [[TMP1]]
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
   %cmp = fcmp ogt double %x, 0.000000e+00

From 0892dddc23c64bf2aab2c5ee1fc9f4dcd68771e0 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Mon, 3 Feb 2025 10:35:14 -0800
Subject: [PATCH 038/282] [llvm] Add CMake flag to compile out the telemetry
 framework (#124850)

Add a CMake flag (LLVM_BUILD_TELEMETRY) to disable building the
telemetry framework. The flag being enabled does *not* mean that
telemetry is being collected, it merely means we're building the generic
telemetry framework. Hence the flag is enabled by default.

Motivated by this Discourse thread:
https://discourse.llvm.org/t/how-to-disable-building-llvm-clang-telemetry/84305

(cherry picked from commit bac62ee5b473e70981a6bd9759ec316315fca07d)
---
 llvm/CMakeLists.txt           | 1 +
 llvm/lib/CMakeLists.txt       | 4 +++-
 llvm/unittests/CMakeLists.txt | 4 +++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index c9ff3696e22d6..d1b4c2700ce8e 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -829,6 +829,7 @@ option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OF
 option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
 option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
 option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
+option (LLVM_BUILD_TELEMETRY "Build the telemtry library. This does not enable telemetry." ON)
 
 set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html"
     CACHE STRING "Doxygen-generated HTML documentation install directory")
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index f6465612d30c0..d0a2bc9294381 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -41,7 +41,9 @@ add_subdirectory(ProfileData)
 add_subdirectory(Passes)
 add_subdirectory(TargetParser)
 add_subdirectory(TextAPI)
-add_subdirectory(Telemetry)
+if (LLVM_BUILD_TELEMETRY)
+  add_subdirectory(Telemetry)
+endif()
 add_subdirectory(ToolDrivers)
 add_subdirectory(XRay)
 if (LLVM_INCLUDE_TESTS)
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 81abce51b8939..12e229b1c3498 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -63,7 +63,9 @@ add_subdirectory(Support)
 add_subdirectory(TableGen)
 add_subdirectory(Target)
 add_subdirectory(TargetParser)
-add_subdirectory(Telemetry)
+if (LLVM_BUILD_TELEMETRY)
+  add_subdirectory(Telemetry)
+endif()
 add_subdirectory(Testing)
 add_subdirectory(TextAPI)
 add_subdirectory(Transforms)

From 7fa1a3a398d510f0876af8908f213dd90bd9a07d Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Mon, 3 Feb 2025 15:05:19 -0800
Subject: [PATCH 039/282] [CMake] Fix typo in docstring: telemtry -> telemetry
 (NFC)

Thanks Nikita for spotting it.

(cherry picked from commit 13ded6829bf7ca793795c50d47dd2b95482e5cfa)
---
 llvm/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index d1b4c2700ce8e..f5293e8663243 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -829,7 +829,7 @@ option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OF
 option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
 option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
 option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
-option (LLVM_BUILD_TELEMETRY "Build the telemtry library. This does not enable telemetry." ON)
+option (LLVM_BUILD_TELEMETRY "Build the telemetry library. This does not enable telemetry." ON)
 
 set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html"
     CACHE STRING "Doxygen-generated HTML documentation install directory")

From 4d16551293fe4a0bf56cebd3f52d2a6e36f166a8 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Fri, 31 Jan 2025 09:07:11 +0100
Subject: [PATCH 040/282] [lldb] Add support for gdb-style 'x' packet (#124733)

See also
https://discourse.llvm.org/t/rfc-fixing-incompatibilties-of-the-x-packet-w-r-t-gdb/84288
and https://sourceware.org/pipermail/gdb/2025-January/051705.html

(cherry picked from commit 13d0318a9848ec322ceea4f37fb6b421d70407b0)
---
 .../Python/lldbsuite/test/gdbclientutils.py   |  6 ++
 .../GDBRemoteCommunicationClient.cpp          | 22 +++++---
 .../gdb-remote/GDBRemoteCommunicationClient.h | 11 +++-
 .../Process/gdb-remote/ProcessGDBRemote.cpp   | 36 +++++++-----
 .../gdb_remote_client/TestReadMemory.py       | 55 +++++++++++++++++++
 5 files changed, 106 insertions(+), 24 deletions(-)
 create mode 100644 lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py

diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
index 1784487323ad6..4b782b3b470fe 100644
--- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
+++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
@@ -126,6 +126,9 @@ def respond(self, packet):
         if packet[0] == "m":
             addr, length = [int(x, 16) for x in packet[1:].split(",")]
             return self.readMemory(addr, length)
+        if packet[0] == "x":
+            addr, length = [int(x, 16) for x in packet[1:].split(",")]
+            return self.x(addr, length)
         if packet[0] == "M":
             location, encoded_data = packet[1:].split(":")
             addr, length = [int(x, 16) for x in location.split(",")]
@@ -267,6 +270,9 @@ def writeRegister(self, register, value_hex):
     def readMemory(self, addr, length):
         return "00" * length
 
+    def x(self, addr, length):
+        return ""
+
     def writeMemory(self, addr, data_hex):
         return "OK"
 
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
index b3f1c6f052955..581dd8f8e0b6b 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
@@ -275,7 +275,6 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) {
     m_supports_vCont_s = eLazyBoolCalculate;
     m_supports_vCont_S = eLazyBoolCalculate;
     m_supports_p = eLazyBoolCalculate;
-    m_supports_x = eLazyBoolCalculate;
     m_supports_QSaveRegisterState = eLazyBoolCalculate;
     m_qHostInfo_is_valid = eLazyBoolCalculate;
     m_curr_pid_is_valid = eLazyBoolCalculate;
@@ -295,6 +294,7 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) {
     m_supports_qXfer_siginfo_read = eLazyBoolCalculate;
     m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate;
     m_uses_native_signals = eLazyBoolCalculate;
+    m_x_packet_state.reset();
     m_supports_qProcessInfoPID = true;
     m_supports_qfProcessInfo = true;
     m_supports_qUserName = true;
@@ -348,6 +348,7 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() {
   m_supports_memory_tagging = eLazyBoolNo;
   m_supports_qSaveCore = eLazyBoolNo;
   m_uses_native_signals = eLazyBoolNo;
+  m_x_packet_state.reset();
 
   m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if
                                   // not, we assume no limit
@@ -401,6 +402,8 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() {
         m_supports_qSaveCore = eLazyBoolYes;
       else if (x == "native-signals+")
         m_uses_native_signals = eLazyBoolYes;
+      else if (x == "binary-upload+")
+        m_x_packet_state = xPacketState::Prefixed;
       // Look for a list of compressions in the features list e.g.
       // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib-
       // deflate,lzma
@@ -715,19 +718,20 @@ Status GDBRemoteCommunicationClient::WriteMemoryTags(
   return status;
 }
 
-bool GDBRemoteCommunicationClient::GetxPacketSupported() {
-  if (m_supports_x == eLazyBoolCalculate) {
+GDBRemoteCommunicationClient::xPacketState
+GDBRemoteCommunicationClient::GetxPacketState() {
+  if (!m_x_packet_state)
+    GetRemoteQSupported();
+  if (!m_x_packet_state) {
     StringExtractorGDBRemote response;
-    m_supports_x = eLazyBoolNo;
-    char packet[256];
-    snprintf(packet, sizeof(packet), "x0,0");
-    if (SendPacketAndWaitForResponse(packet, response) ==
+    m_x_packet_state = xPacketState::Unimplemented;
+    if (SendPacketAndWaitForResponse("x0,0", response) ==
         PacketResult::Success) {
       if (response.IsOKResponse())
-        m_supports_x = eLazyBoolYes;
+        m_x_packet_state = xPacketState::Bare;
     }
   }
-  return m_supports_x;
+  return *m_x_packet_state;
 }
 
 lldb::pid_t GDBRemoteCommunicationClient::GetCurrentProcessID(bool allow_lazy) {
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h
index 898d176abc346..1118a76d7211b 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h
@@ -218,7 +218,14 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase {
 
   bool GetpPacketSupported(lldb::tid_t tid);
 
-  bool GetxPacketSupported();
+  enum class xPacketState {
+    Unimplemented,
+    Prefixed, // Successful responses start with a 'b' character. This is the
+              // style used by GDB.
+    Bare,     // No prefix, packets starts with the memory being read. This is
+              // LLDB's original style.
+  };
+  xPacketState GetxPacketState();
 
   bool GetVAttachOrWaitSupported();
 
@@ -541,7 +548,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase {
   LazyBool m_attach_or_wait_reply = eLazyBoolCalculate;
   LazyBool m_prepare_for_reg_writing_reply = eLazyBoolCalculate;
   LazyBool m_supports_p = eLazyBoolCalculate;
-  LazyBool m_supports_x = eLazyBoolCalculate;
   LazyBool m_avoid_g_packets = eLazyBoolCalculate;
   LazyBool m_supports_QSaveRegisterState = eLazyBoolCalculate;
   LazyBool m_supports_qXfer_auxv_read = eLazyBoolCalculate;
@@ -561,6 +567,7 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase {
   LazyBool m_supports_memory_tagging = eLazyBoolCalculate;
   LazyBool m_supports_qSaveCore = eLazyBoolCalculate;
   LazyBool m_uses_native_signals = eLazyBoolCalculate;
+  std::optional<xPacketState> m_x_packet_state;
 
   bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1,
       m_supports_qUserName : 1, m_supports_qGroupName : 1,
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 538c868014009..21a0fa283644d 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -2609,11 +2609,15 @@ void ProcessGDBRemote::WillPublicStop() {
 // Process Memory
 size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size,
                                       Status &error) {
+  using xPacketState = GDBRemoteCommunicationClient::xPacketState;
+
   GetMaxMemorySize();
-  bool binary_memory_read = m_gdb_comm.GetxPacketSupported();
+  xPacketState x_state = m_gdb_comm.GetxPacketState();
+
   // M and m packets take 2 bytes for 1 byte of memory
-  size_t max_memory_size =
-      binary_memory_read ? m_max_memory_size : m_max_memory_size / 2;
+  size_t max_memory_size = x_state != xPacketState::Unimplemented
+                               ? m_max_memory_size
+                               : m_max_memory_size / 2;
   if (size > max_memory_size) {
     // Keep memory read sizes down to a sane limit. This function will be
     // called multiple times in order to complete the task by
@@ -2624,8 +2628,8 @@ size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size,
   char packet[64];
   int packet_len;
   packet_len = ::snprintf(packet, sizeof(packet), "%c%" PRIx64 ",%" PRIx64,
-                          binary_memory_read ? 'x' : 'm', (uint64_t)addr,
-                          (uint64_t)size);
+                          x_state != xPacketState::Unimplemented ? 'x' : 'm',
+                          (uint64_t)addr, (uint64_t)size);
   assert(packet_len + 1 < (int)sizeof(packet));
   UNUSED_IF_ASSERT_DISABLED(packet_len);
   StringExtractorGDBRemote response;
@@ -2634,19 +2638,25 @@ size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size,
       GDBRemoteCommunication::PacketResult::Success) {
     if (response.IsNormalResponse()) {
       error.Clear();
-      if (binary_memory_read) {
+      if (x_state != xPacketState::Unimplemented) {
         // The lower level GDBRemoteCommunication packet receive layer has
         // already de-quoted any 0x7d character escaping that was present in
         // the packet
 
-        size_t data_received_size = response.GetBytesLeft();
-        if (data_received_size > size) {
-          // Don't write past the end of BUF if the remote debug server gave us
-          // too much data for some reason.
-          data_received_size = size;
+        llvm::StringRef data_received = response.GetStringRef();
+        if (x_state == xPacketState::Prefixed &&
+            !data_received.consume_front("b")) {
+          error = Status::FromErrorStringWithFormatv(
+              "unexpected response to GDB server memory read packet '{0}': "
+              "'{1}'",
+              packet, data_received);
+          return 0;
         }
-        memcpy(buf, response.GetStringRef().data(), data_received_size);
-        return data_received_size;
+        // Don't write past the end of BUF if the remote debug server gave us
+        // too much data for some reason.
+        size_t memcpy_size = std::min(size, data_received.size());
+        memcpy(buf, data_received.data(), memcpy_size);
+        return memcpy_size;
       } else {
         return response.GetHexBytes(
             llvm::MutableArrayRef<uint8_t>((uint8_t *)buf, size), '\xdd');
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py b/lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py
new file mode 100644
index 0000000000000..81dcb54aef5d8
--- /dev/null
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py
@@ -0,0 +1,55 @@
+import lldb
+from lldbsuite.support import seven
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test.gdbclientutils import *
+from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase
+
+
+class TestReadMemory(GDBRemoteTestBase):
+    def test_x_with_prefix(self):
+        class MyResponder(MockGDBServerResponder):
+            def qSupported(self, client_features):
+                # binary-upload+ indicates we use the gdb style of x packets
+                return super().qSupported(client_features) + ";binary-upload+"
+
+            def x(self, addr, length):
+                return "bfoobar" if addr == 0x1000 else "E01"
+
+        self.server.responder = MyResponder()
+        target = self.dbg.CreateTargetWithFileAndTargetTriple("", "x86_64-pc-linux")
+        process = self.connect(target)
+
+        error = lldb.SBError()
+        self.assertEqual(b"foobar", process.ReadMemory(0x1000, 10, error))
+
+    def test_x_bare(self):
+        class MyResponder(MockGDBServerResponder):
+            def x(self, addr, length):
+                # The OK response indicates we use the old lldb style.
+                if addr == 0 and length == 0:
+                    return "OK"
+                return "foobar" if addr == 0x1000 else "E01"
+
+        self.server.responder = MyResponder()
+        target = self.dbg.CreateTargetWithFileAndTargetTriple("", "x86_64-pc-linux")
+        process = self.connect(target)
+
+        error = lldb.SBError()
+        self.assertEqual(b"foobar", process.ReadMemory(0x1000, 10, error))
+
+    def test_m_fallback(self):
+        class MyResponder(MockGDBServerResponder):
+            def x(self, addr, length):
+                # If `x` is unsupported, we should fall back to `m`.
+                return ""
+
+            def readMemory(self, addr, length):
+                return seven.hexlify("foobar") if addr == 0x1000 else "E01"
+
+        self.server.responder = MyResponder()
+        target = self.dbg.CreateTargetWithFileAndTargetTriple("", "x86_64-pc-linux")
+        process = self.connect(target)
+
+        error = lldb.SBError()
+        self.assertEqual(b"foobar", process.ReadMemory(0x1000, 10, error))

From 04209c2a2b3eddbfcf6a172e521d40c23780ce41 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Fri, 7 Feb 2025 07:52:23 +0100
Subject: [PATCH 041/282] Add info about the gdb x packet into the release
 notes (#125680)

See also #125653.
---
 llvm/docs/ReleaseNotes.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index db9a681ebe2bc..44a0b17d6a07b 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -604,6 +604,8 @@ Changes to LLDB
 
 * Incorrect floating-point register DWARF numbers for LoongArch were [fixed](https://github.com/llvm/llvm-project/pull/120391).
 
+* Support was added for handling the GDB Remote Protocol `x` packet in the format introduced by GDB 16.2. LLDB currently uses a different format for `x` and LLDB is now able to handle both formats. At some point in the future support for LLDB's format of `x` will be removed.
+
 Changes to BOLT
 ---------------------------------
 

From 71ee354bab4ad4e3132d079f11e7b9771012442c Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Tue, 4 Feb 2025 17:36:31 +0100
Subject: [PATCH 042/282] [libc++][TZDB] Fixes %z escaping. (#125399)

The previous tested TZDB did not contain %z for the rule letters. The
usage of %z in TZDB 2024b revealed a bug in the implementation. The
patch fixes it and has been locally tested with TZDB 2024b.

Fixes #108957

(cherry picked from commit a27f3b2bb137001735949549354aff89dbf227f4)
---
 libcxx/src/experimental/time_zone.cpp                      | 2 +-
 .../time.zone.members/get_info.sys_time.pass.cpp           | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/libcxx/src/experimental/time_zone.cpp b/libcxx/src/experimental/time_zone.cpp
index f7d82a5d4cfc3..289164ab12036 100644
--- a/libcxx/src/experimental/time_zone.cpp
+++ b/libcxx/src/experimental/time_zone.cpp
@@ -668,7 +668,7 @@ __first_rule(seconds __stdoff, const vector<__tz::__rule>& __rules) {
                __continuation_end,
                __continuation.__stdoff + __save,
                chrono::duration_cast<minutes>(__save),
-               __continuation.__format},
+               chrono::__format(__continuation, __continuation.__format, __save)},
       true};
 }
 
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
index 7f08c64d5e0e7..afd1273421f39 100644
--- a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
@@ -157,7 +157,6 @@ static void test_abbrev(std::string_view input, std::string_view expected) {
   TEST_LIBCPP_REQUIRE(result == expected, TEST_WRITE_CONCATENATED("\nExpected ", expected, "\nActual ", result, '\n'));
 }
 
-// This format is valid, however is not used in the tzdata.zi.
 static void percentage_z_format() {
   test_abbrev(
       R"(
@@ -188,6 +187,12 @@ Z Format 0:45 F %z)",
 R F 1999 max - Jan 5 0 -1 foo
 Z Format 0:45 F %z)",
       "-0015");
+
+  test_abbrev(
+      R"(
+Z Format -1:2:20 - LMT 1912 Ja 1 1u
+-1 - %z)",
+      "-01");
 }
 
 int main(int, const char**) {

From 3a3a3230d171e11842a9940b6da0f72022b1c5b3 Mon Sep 17 00:00:00 2001
From: "s.vgys" <83276820+samvangysegem@users.noreply.github.com>
Date: Fri, 7 Feb 2025 18:45:38 +0100
Subject: [PATCH 043/282] fix: removes invalid token from LLVM_VERSION_SUFFIX
 in LIBC namespace (#126193)

Resolves #125831

(cherry picked from commit 51759ffc4408e9eb5c2d40c9489ce3b98de233d5)
---
 libc/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index c061e2a05ebd8..1c4c0cd5aa22b 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -51,7 +51,8 @@ set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel header
 # Defining a global namespace to enclose all libc functions.
 set(default_namespace "__llvm_libc")
 if(LLVM_VERSION_MAJOR)
-  set(default_namespace "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LLVM_VERSION_PATCH}_${LLVM_VERSION_SUFFIX}")
+  string(REPLACE "-" "" NS_LLVM_VERSION_SUFFIX ${LLVM_VERSION_SUFFIX})
+  set(default_namespace "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LLVM_VERSION_PATCH}_${NS_LLVM_VERSION_SUFFIX}")
 endif()
 set(LIBC_NAMESPACE ${default_namespace}
   CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'."

From 14c5784ee4c03c5a7aa6f483486985d2c24a2407 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 31 Jan 2025 17:37:17 +0100
Subject: [PATCH 044/282] [TableGen] Reduce size of MatchTableRecord (NFC)
 (#125221)

MatchTableRecord stores a 64-bit RawValue. However, this field is only
needed by a small part of the code (jump table generation).

Create a separate RecordAndValue structure that is used in just the
necessary places.

Based on massif, this reduces memory usage on RISCVGenGlobalISel.inc by
about 100MB (to 2.15GB).

(cherry picked from commit e2301d674976b84ba505065a9702f3376e05bc43)
---
 .../GlobalISel/GlobalISelMatchTable.cpp       | 46 +++++++------------
 .../Common/GlobalISel/GlobalISelMatchTable.h  | 42 ++++++++---------
 2 files changed, 38 insertions(+), 50 deletions(-)

diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index f0cd98dd2dee0..8564bf8d2d91b 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -227,26 +227,12 @@ MatchTableRecord MatchTable::NamedValue(unsigned NumBytes,
                           MatchTableRecord::MTRF_CommaFollows);
 }
 
-MatchTableRecord MatchTable::NamedValue(unsigned NumBytes, StringRef NamedValue,
-                                        int64_t RawValue) {
-  return MatchTableRecord(std::nullopt, NamedValue, NumBytes,
-                          MatchTableRecord::MTRF_CommaFollows, RawValue);
-}
-
 MatchTableRecord MatchTable::NamedValue(unsigned NumBytes, StringRef Namespace,
                                         StringRef NamedValue) {
   return MatchTableRecord(std::nullopt, (Namespace + "::" + NamedValue).str(),
                           NumBytes, MatchTableRecord::MTRF_CommaFollows);
 }
 
-MatchTableRecord MatchTable::NamedValue(unsigned NumBytes, StringRef Namespace,
-                                        StringRef NamedValue,
-                                        int64_t RawValue) {
-  return MatchTableRecord(std::nullopt, (Namespace + "::" + NamedValue).str(),
-                          NumBytes, MatchTableRecord::MTRF_CommaFollows,
-                          RawValue);
-}
-
 MatchTableRecord MatchTable::IntValue(unsigned NumBytes, int64_t IntValue) {
   assert(isUIntN(NumBytes * 8, IntValue) || isIntN(NumBytes * 8, IntValue));
   auto Str = llvm::to_string(IntValue);
@@ -651,8 +637,8 @@ void SwitchMatcher::emit(MatchTable &Table) {
                 [&Table]() { return Table.allocateLabelID(); });
   const unsigned Default = Table.allocateLabelID();
 
-  const int64_t LowerBound = Values.begin()->getRawValue();
-  const int64_t UpperBound = Values.rbegin()->getRawValue() + 1;
+  const int64_t LowerBound = Values.begin()->RawValue;
+  const int64_t UpperBound = Values.rbegin()->RawValue + 1;
 
   emitPredicateSpecificOpcodes(*Condition, Table);
 
@@ -664,10 +650,11 @@ void SwitchMatcher::emit(MatchTable &Table) {
   auto VI = Values.begin();
   for (unsigned I = 0, E = Values.size(); I < E; ++I) {
     auto V = *VI++;
-    while (J++ < V.getRawValue())
+    while (J++ < V.RawValue)
       Table << MatchTable::IntValue(4, 0);
-    V.turnIntoComment();
-    Table << MatchTable::LineBreak << V << MatchTable::JumpTarget(LabelIDs[I]);
+    V.Record.turnIntoComment();
+    Table << MatchTable::LineBreak << V.Record
+          << MatchTable::JumpTarget(LabelIDs[I]);
   }
   Table << MatchTable::LineBreak;
 
@@ -1145,11 +1132,11 @@ void SameOperandMatcher::emitPredicateOpcodes(MatchTable &Table,
 
 std::map<LLTCodeGen, unsigned> LLTOperandMatcher::TypeIDValues;
 
-MatchTableRecord LLTOperandMatcher::getValue() const {
+RecordAndValue LLTOperandMatcher::getValue() const {
   const auto VI = TypeIDValues.find(Ty);
   if (VI == TypeIDValues.end())
     return MatchTable::NamedValue(1, getTy().getCxxEnumValue());
-  return MatchTable::NamedValue(1, getTy().getCxxEnumValue(), VI->second);
+  return {MatchTable::NamedValue(1, getTy().getCxxEnumValue()), VI->second};
 }
 
 bool LLTOperandMatcher::hasValue() const {
@@ -1167,7 +1154,8 @@ void LLTOperandMatcher::emitPredicateOpcodes(MatchTable &Table,
           << MatchTable::ULEB128Value(InsnVarID);
   }
   Table << MatchTable::Comment("Op") << MatchTable::ULEB128Value(OpIdx)
-        << MatchTable::Comment("Type") << getValue() << MatchTable::LineBreak;
+        << MatchTable::Comment("Type") << getValue().Record
+        << MatchTable::LineBreak;
 }
 
 //===- PointerToAnyOperandMatcher -----------------------------------------===//
@@ -1411,12 +1399,12 @@ Error OperandMatcher::addTypeCheckPredicate(const TypeSetByHwMode &VTy,
 DenseMap<const CodeGenInstruction *, unsigned>
     InstructionOpcodeMatcher::OpcodeValues;
 
-MatchTableRecord
+RecordAndValue
 InstructionOpcodeMatcher::getInstValue(const CodeGenInstruction *I) const {
   const auto VI = OpcodeValues.find(I);
   if (VI != OpcodeValues.end())
-    return MatchTable::NamedValue(2, I->Namespace, I->TheDef->getName(),
-                                  VI->second);
+    return {MatchTable::NamedValue(2, I->Namespace, I->TheDef->getName()),
+            VI->second};
   return MatchTable::NamedValue(2, I->Namespace, I->TheDef->getName());
 }
 
@@ -1428,14 +1416,14 @@ void InstructionOpcodeMatcher::initOpcodeValuesMap(
     OpcodeValues[I] = Target.getInstrIntValue(I->TheDef);
 }
 
-MatchTableRecord InstructionOpcodeMatcher::getValue() const {
+RecordAndValue InstructionOpcodeMatcher::getValue() const {
   assert(Insts.size() == 1);
 
   const CodeGenInstruction *I = Insts[0];
   const auto VI = OpcodeValues.find(I);
   if (VI != OpcodeValues.end())
-    return MatchTable::NamedValue(2, I->Namespace, I->TheDef->getName(),
-                                  VI->second);
+    return {MatchTable::NamedValue(2, I->Namespace, I->TheDef->getName()),
+            VI->second};
   return MatchTable::NamedValue(2, I->Namespace, I->TheDef->getName());
 }
 
@@ -1447,7 +1435,7 @@ void InstructionOpcodeMatcher::emitPredicateOpcodes(MatchTable &Table,
         << MatchTable::ULEB128Value(InsnVarID);
 
   for (const CodeGenInstruction *I : Insts)
-    Table << getInstValue(I);
+    Table << getInstValue(I).Record;
   Table << MatchTable::LineBreak;
 }
 
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index e7914a613973b..77c8bc290faaf 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -145,14 +145,10 @@ struct MatchTableRecord {
   /// A bitfield of RecordFlagsBits flags.
   unsigned Flags;
 
-  /// The actual run-time value, if known
-  int64_t RawValue;
-
   MatchTableRecord(std::optional<unsigned> LabelID_, StringRef EmitStr,
-                   unsigned NumElements, unsigned Flags,
-                   int64_t RawValue = std::numeric_limits<int64_t>::min())
+                   unsigned NumElements, unsigned Flags)
       : LabelID(LabelID_.value_or(~0u)), EmitStr(EmitStr),
-        NumElements(NumElements), Flags(Flags), RawValue(RawValue) {
+        NumElements(NumElements), Flags(Flags) {
     assert((!LabelID_ || LabelID != ~0u) &&
            "This value is reserved for non-labels");
   }
@@ -166,12 +162,6 @@ struct MatchTableRecord {
     NumElements = 0;
   }
 
-  /// For Jump Table generation purposes
-  bool operator<(const MatchTableRecord &Other) const {
-    return RawValue < Other.RawValue;
-  }
-  int64_t getRawValue() const { return RawValue; }
-
   void emit(raw_ostream &OS, bool LineBreakNextAfterThis,
             const MatchTable &Table) const;
   unsigned size() const { return NumElements; }
@@ -202,12 +192,8 @@ class MatchTable {
   static MatchTableRecord Comment(StringRef Comment);
   static MatchTableRecord Opcode(StringRef Opcode, int IndentAdjust = 0);
   static MatchTableRecord NamedValue(unsigned NumBytes, StringRef NamedValue);
-  static MatchTableRecord NamedValue(unsigned NumBytes, StringRef NamedValue,
-                                     int64_t RawValue);
   static MatchTableRecord NamedValue(unsigned NumBytes, StringRef Namespace,
                                      StringRef NamedValue);
-  static MatchTableRecord NamedValue(unsigned NumBytes, StringRef Namespace,
-                                     StringRef NamedValue, int64_t RawValue);
   static MatchTableRecord IntValue(unsigned NumBytes, int64_t IntValue);
   static MatchTableRecord ULEB128Value(uint64_t IntValue);
   static MatchTableRecord Label(unsigned LabelID);
@@ -400,6 +386,20 @@ class GroupMatcher final : public Matcher {
   bool candidateConditionMatches(const PredicateMatcher &Predicate) const;
 };
 
+/// MatchTableRecord and associated value, for jump table generation.
+struct RecordAndValue {
+  MatchTableRecord Record;
+  int64_t RawValue;
+
+  RecordAndValue(MatchTableRecord Record,
+                 int64_t RawValue = std::numeric_limits<int64_t>::min())
+      : Record(std::move(Record)), RawValue(RawValue) {}
+
+  bool operator<(const RecordAndValue &Other) const {
+    return RawValue < Other.RawValue;
+  }
+};
+
 class SwitchMatcher : public Matcher {
   /// All the nested matchers, representing distinct switch-cases. The first
   /// conditions (as Matcher::getFirstCondition() reports) of all the nested
@@ -414,7 +414,7 @@ class SwitchMatcher : public Matcher {
 
   /// Temporary set used to check that the case values don't repeat within the
   /// same switch.
-  std::set<MatchTableRecord> Values;
+  std::set<RecordAndValue> Values;
 
   /// An owning collection for any auxiliary matchers created while optimizing
   /// nested matchers contained.
@@ -874,7 +874,7 @@ class PredicateMatcher {
     return hasValue() && PredicateMatcher::isIdentical(B);
   }
 
-  virtual MatchTableRecord getValue() const {
+  virtual RecordAndValue getValue() const {
     assert(hasValue() && "Can not get a value of a value-less predicate!");
     llvm_unreachable("Not implemented yet");
   }
@@ -968,7 +968,7 @@ class LLTOperandMatcher : public OperandPredicateMatcher {
            Ty == cast<LLTOperandMatcher>(&B)->Ty;
   }
 
-  MatchTableRecord getValue() const override;
+  RecordAndValue getValue() const override;
   bool hasValue() const override;
 
   LLTCodeGen getTy() const { return Ty; }
@@ -1378,7 +1378,7 @@ class InstructionOpcodeMatcher : public InstructionPredicateMatcher {
 
   static DenseMap<const CodeGenInstruction *, unsigned> OpcodeValues;
 
-  MatchTableRecord getInstValue(const CodeGenInstruction *I) const;
+  RecordAndValue getInstValue(const CodeGenInstruction *I) const;
 
 public:
   static void initOpcodeValuesMap(const CodeGenTarget &Target);
@@ -1405,7 +1405,7 @@ class InstructionOpcodeMatcher : public InstructionPredicateMatcher {
 
   // TODO: This is used for the SwitchMatcher optimization. We should be able to
   // return a list of the opcodes to match.
-  MatchTableRecord getValue() const override;
+  RecordAndValue getValue() const override;
 
   void emitPredicateOpcodes(MatchTable &Table,
                             RuleMatcher &Rule) const override;

From 4d04a406a5a5e13af679018b116fc9975eb85579 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 31 Jan 2025 21:27:09 +0100
Subject: [PATCH 045/282] [TableGen] Don't use inline storage for ReferenceLocs
 (NFC) (#125231)

The ReferenceLocs are not enabled by default (they are used by the
tablegen lsp server), and as such always empty, but still allocate
inline storage for the SmallVector. Disabling it saves about 200MB on
RISCVGenGlobalISel.inc.

(The equivalent field in Record already disables inline storage.)

(cherry picked from commit c640f97ccf723e64ff24af225cb995c905538406)
---
 llvm/include/llvm/TableGen/Record.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index d9930a48e8084..e04ed34823148 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -1523,7 +1523,7 @@ class RecordVal {
   bool IsUsed = false;
 
   /// Reference locations to this record value.
-  SmallVector<SMRange> ReferenceLocs;
+  SmallVector<SMRange, 0> ReferenceLocs;
 
 public:
   RecordVal(const Init *N, const RecTy *T, FieldKind K);

From 4cde428de0fe9f16d6c2d8fdbe501aa701c9c6ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= <kadircet@google.com>
Date: Thu, 6 Feb 2025 10:02:42 +0100
Subject: [PATCH 046/282] [clang] Stop parsing warning suppression mappings in
 driver (#125722)

This gets rid of some extra IO from driver startup, and possiblity of
emitting warnings twice.

(cherry picked from commit df22bbe2beb57687c76402bc0cfdf7901a31cf29)
---
 .../test/Driver/warning-suppression-mappings-not-parsed.cpp  | 5 +++++
 clang/tools/driver/driver.cpp                                | 4 ++++
 2 files changed, 9 insertions(+)
 create mode 100644 clang/test/Driver/warning-suppression-mappings-not-parsed.cpp

diff --git a/clang/test/Driver/warning-suppression-mappings-not-parsed.cpp b/clang/test/Driver/warning-suppression-mappings-not-parsed.cpp
new file mode 100644
index 0000000000000..8f52fb1c6cc7d
--- /dev/null
+++ b/clang/test/Driver/warning-suppression-mappings-not-parsed.cpp
@@ -0,0 +1,5 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: echo '[unknown-warning]' > %t/foo.txt
+// RUN: %clang -fdriver-only --warning-suppression-mappings=%t/foo.txt %s | FileCheck -allow-empty %s
+// CHECK-NOT: unknown warning option 'unknown-warning'
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 74923247b7ee1..00c00cea16f47 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -318,6 +318,10 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
 
   IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts =
       CreateAndPopulateDiagOpts(Args);
+  // Driver's diagnostics don't use suppression mappings, so don't bother
+  // parsing them. CC1 still receives full args, so this doesn't impact other
+  // actions.
+  DiagOpts->DiagnosticSuppressionMappingsFile.clear();
 
   TextDiagnosticPrinter *DiagClient
     = new TextDiagnosticPrinter(llvm::errs(), &*DiagOpts);

From 7bcfaa1c4d9ca11eabd1d80b9e3007f1640b2da1 Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Thu, 6 Feb 2025 04:36:47 -0500
Subject: [PATCH 047/282] [flang][Driver] When linking with the Fortran runtime
 also link with libexecinfo (#125998)

Also link with libexecinfo on FreeBSD, NetBSD, OpenBSD and DragonFly
for the backtrace functions.

(cherry picked from commit d1de75acea0da55316cd7827563e064105868f0f)
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp |  5 +++++
 flang/test/Driver/linker-flags.f90         | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 2c4b082bcce4a..4ed4dbc1a8d1b 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1340,6 +1340,11 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
     CmdArgs.push_back("-lFortranRuntime");
     CmdArgs.push_back("-lFortranDecimal");
     addArchSpecificRPath(TC, Args, CmdArgs);
+
+    // needs libexecinfo for backtrace functions
+    if (TC.getTriple().isOSFreeBSD() || TC.getTriple().isOSNetBSD() ||
+        TC.getTriple().isOSOpenBSD() || TC.getTriple().isOSDragonFly())
+      CmdArgs.push_back("-lexecinfo");
   }
 
   // libomp needs libatomic for atomic operations if using libgcc
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index ac9500d7c45ce..b998cbaa6227c 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -5,10 +5,10 @@
 ! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
 ! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN,DARWIN-F128%f128-lib
 ! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,SOLARIS-F128%f128-lib
-! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
-! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
-! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
-! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib
 ! RUN: %flang -### -rtlib=compiler-rt --target=aarch64-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,COMPILER-RT
@@ -36,6 +36,14 @@
 ! UNIX-SAME: "-lFortranRuntime" "-lFortranDecimal" "-lm"
 ! COMPILER-RT: "{{.*}}{{\\|/}}libclang_rt.builtins.a"
 
+! BSD-LABEL:  "{{.*}}ld{{(\.exe)?}}"
+! BSD-SAME: "[[object_file]]"
+! BSD-F128NONE-NOT: FortranFloat128Math
+! BSD-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
+! BSD-SAME: -lFortranRuntime
+! BSD-SAME: -lFortranDecimal
+! BSD-SAME: -lexecinfo
+
 ! DARWIN-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! DARWIN-SAME: "[[object_file]]"
 ! DARWIN-F128NONE-NOT: FortranFloat128Math

From 3542150f05a1f32796d2258f679b9190b7d1f825 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 7 Feb 2025 09:36:52 +0000
Subject: [PATCH 048/282] [LoopVectorize] Fix cost model assert when
 vectorising calls (#125716)

The legacy and vplan cost models did not agree because
VPWidenCallRecipe::computeCost only calculates the cost of the
call instruction, whereas
LoopVectorizationCostModel::setVectorizedCallDecision in some
cases adds on the cost of a synthesised mask argument. However,
this mask is always 'splat(i1 true)' which should be hoisted out
of the loop during codegen. In order to synchronise the two cost
models I have two options:

1) Also add the cost of the splat to the vplan model, or
2) Remove the cost of the splat from the legacy model.

I chose 2) because I feel this more closely represents what the
final code will look like. There is an argument that we should
take account of such broadcast costs in the preheader when
deciding if it's profitable to vectorise a loop, however there
isn't currently a mechanism to do this. We currently only take
account of the runtime checks when assessing profitability and
what the minimum trip count should be. However, I don't believe
this work needs doing as part of this PR.

(cherry picked from commit 1930524bbde3cd26ff527bbdb5e1f937f484edd6)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  13 +-
 .../LoopVectorize/AArch64/masked-call.ll      | 272 +++++++++++++++++-
 2 files changed, 262 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c4b159117e2e8..318e4809d97ee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6331,19 +6331,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         break;
       }
 
-      // Add in the cost of synthesizing a mask if one wasn't required.
-      InstructionCost MaskCost = 0;
-      if (VecFunc && UsesMask && !MaskRequired)
-        MaskCost = TTI.getShuffleCost(
-            TargetTransformInfo::SK_Broadcast,
-            VectorType::get(IntegerType::getInt1Ty(
-                                VecFunc->getFunctionType()->getContext()),
-                            VF),
-            {}, CostKind);
-
       if (TLI && VecFunc && !CI->isNoBuiltin())
-        VectorCost =
-            TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
+        VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
 
       // Find the cost of an intrinsic; some targets may have instructions that
       // perform the operation without needing an actual call.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 61bbae0b3f16a..5b0f0961a6297 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -39,7 +39,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
-; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]]
+; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -177,7 +177,7 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[TMP12]], 50
 ; TFNONE-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; TFNONE:       if.then:
-; TFNONE-NEXT:    [[TMP13:%.*]] = call i64 @foo(i64 [[TMP12]]) #[[ATTR3]]
+; TFNONE-NEXT:    [[TMP13:%.*]] = call i64 @foo(i64 [[TMP12]]) #[[ATTR4]]
 ; TFNONE-NEXT:    br label [[IF_END]]
 ; TFNONE:       if.end:
 ; TFNONE-NEXT:    [[TMP14:%.*]] = phi i64 [ [[TMP13]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
@@ -339,10 +339,10 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[TMP13]], 50
 ; TFNONE-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 ; TFNONE:       if.then:
-; TFNONE-NEXT:    [[TMP14:%.*]] = call i64 @foo(i64 [[TMP13]]) #[[ATTR4:[0-9]+]]
+; TFNONE-NEXT:    [[TMP14:%.*]] = call i64 @foo(i64 [[TMP13]]) #[[ATTR5:[0-9]+]]
 ; TFNONE-NEXT:    br label [[IF_END]]
 ; TFNONE:       if.else:
-; TFNONE-NEXT:    [[TMP15:%.*]] = call i64 @foo(i64 0) #[[ATTR4]]
+; TFNONE-NEXT:    [[TMP15:%.*]] = call i64 @foo(i64 0) #[[ATTR5]]
 ; TFNONE-NEXT:    br label [[IF_END]]
 ; TFNONE:       if.end:
 ; TFNONE-NEXT:    [[TMP16:%.*]] = phi i64 [ [[TMP14]], [[IF_THEN]] ], [ [[TMP15]], [[IF_ELSE]] ]
@@ -509,7 +509,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
-; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
+; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -525,7 +525,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFALWAYS-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
 ; TFALWAYS-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
-; TFALWAYS-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
+; TFALWAYS-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]]
 ; TFALWAYS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; TFALWAYS-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFALWAYS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -557,7 +557,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
 ; TFFALLBACK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
-; TFFALLBACK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
+; TFFALLBACK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]]
 ; TFFALLBACK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
 ; TFFALLBACK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFFALLBACK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -573,7 +573,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFA_INTERLEAVE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
 ; TFA_INTERLEAVE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
-; TFA_INTERLEAVE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
+; TFA_INTERLEAVE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; TFA_INTERLEAVE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFA_INTERLEAVE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -634,7 +634,7 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
-; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]]
+; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR7:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -806,7 +806,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
 ; TFNONE-NEXT:    [[MULADD]] = tail call double @llvm.fmuladd.f64(double [[LOAD]], double [[M]], double [[FMA_SUM]])
 ; TFNONE-NEXT:    [[TOINT:%.*]] = fptoui double [[LOAD]] to i64
-; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[TOINT]]) #[[ATTR3]]
+; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[TOINT]]) #[[ATTR4]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -963,16 +963,266 @@ for.cond.cleanup:
   ret double %muladd
 }
 
+
+define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
+; TFNONE-LABEL: @test_widen_exp_v2(
+; TFNONE-NEXT:  entry:
+; TFNONE-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; TFNONE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TFNONE:       vector.ph:
+; TFNONE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; TFNONE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFNONE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TFNONE:       vector.body:
+; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFNONE-NEXT:    [[TMP7:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; TFNONE-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
+; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
+; TFNONE-NEXT:    [[TMP10:%.*]] = xor <vscale x 2 x i1> [[TMP9]], splat (i1 true)
+; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> zeroinitializer
+; TFNONE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; TFNONE-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 2
+; TFNONE-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
+; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT:    store double [[TMP14]], ptr [[P:%.*]], align 8
+; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; TFNONE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TFNONE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; TFNONE:       middle.block:
+; TFNONE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; TFNONE-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; TFNONE:       scalar.ph:
+; TFNONE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; TFNONE-NEXT:    br label [[LOOP:%.*]]
+; TFNONE:       loop:
+; TFNONE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_END:%.*]] ]
+; TFNONE-NEXT:    [[LD:%.*]] = load double, ptr [[P2]], align 8
+; TFNONE-NEXT:    [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR8:[0-9]+]]
+; TFNONE-NEXT:    [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00
+; TFNONE-NEXT:    br i1 [[COND1]], label [[LOOP_MIDDLE:%.*]], label [[LOOP_END]]
+; TFNONE:       loop.middle:
+; TFNONE-NEXT:    br label [[LOOP_END]]
+; TFNONE:       loop.end:
+; TFNONE-NEXT:    [[SINK:%.*]] = phi double [ 0.000000e+00, [[LOOP_MIDDLE]] ], [ 1.000000e+00, [[LOOP]] ]
+; TFNONE-NEXT:    store double [[SINK]], ptr [[P]], align 8
+; TFNONE-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; TFNONE-NEXT:    [[COND2:%.*]] = icmp eq i64 [[IV]], [[N]]
+; TFNONE-NEXT:    br i1 [[COND2]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; TFNONE:       end:
+; TFNONE-NEXT:    ret void
+;
+; TFALWAYS-LABEL: @test_widen_exp_v2(
+; TFALWAYS-NEXT:  entry:
+; TFALWAYS-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; TFALWAYS-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; TFALWAYS-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; TFALWAYS-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFALWAYS-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], 2
+; TFALWAYS-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
+; TFALWAYS-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
+; TFALWAYS-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; TFALWAYS-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TFALWAYS:       vector.body:
+; TFALWAYS-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; TFALWAYS-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; TFALWAYS-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFALWAYS-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
+; TFALWAYS-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
+; TFALWAYS-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; TFALWAYS-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
+; TFALWAYS-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFALWAYS-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
+; TFALWAYS-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
+; TFALWAYS-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFALWAYS-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; TFALWAYS-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TFALWAYS:       pred.store.if:
+; TFALWAYS-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
+; TFALWAYS-NEXT:    store double [[TMP13]], ptr [[P:%.*]], align 8
+; TFALWAYS-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; TFALWAYS:       pred.store.continue:
+; TFALWAYS-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; TFALWAYS-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; TFALWAYS:       pred.store.if1:
+; TFALWAYS-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFALWAYS-NEXT:    store double [[TMP15]], ptr [[P]], align 8
+; TFALWAYS-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; TFALWAYS:       pred.store.continue2:
+; TFALWAYS-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; TFALWAYS-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
+; TFALWAYS-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFALWAYS-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0
+; TFALWAYS-NEXT:    br i1 [[TMP17]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TFALWAYS:       end:
+; TFALWAYS-NEXT:    ret void
+;
+; TFFALLBACK-LABEL: @test_widen_exp_v2(
+; TFFALLBACK-NEXT:  entry:
+; TFFALLBACK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; TFFALLBACK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; TFFALLBACK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; TFFALLBACK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFFALLBACK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], 2
+; TFFALLBACK-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
+; TFFALLBACK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
+; TFFALLBACK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; TFFALLBACK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TFFALLBACK:       vector.body:
+; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; TFFALLBACK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; TFFALLBACK-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFFALLBACK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
+; TFFALLBACK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
+; TFFALLBACK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; TFFALLBACK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
+; TFFALLBACK-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFFALLBACK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
+; TFFALLBACK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
+; TFFALLBACK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFFALLBACK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; TFFALLBACK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TFFALLBACK:       pred.store.if:
+; TFFALLBACK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
+; TFFALLBACK-NEXT:    store double [[TMP13]], ptr [[P:%.*]], align 8
+; TFFALLBACK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; TFFALLBACK:       pred.store.continue:
+; TFFALLBACK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; TFFALLBACK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; TFFALLBACK:       pred.store.if1:
+; TFFALLBACK-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFFALLBACK-NEXT:    store double [[TMP15]], ptr [[P]], align 8
+; TFFALLBACK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; TFFALLBACK:       pred.store.continue2:
+; TFFALLBACK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; TFFALLBACK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
+; TFFALLBACK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFFALLBACK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0
+; TFFALLBACK-NEXT:    br i1 [[TMP17]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TFFALLBACK:       end:
+; TFFALLBACK-NEXT:    ret void
+;
+; TFA_INTERLEAVE-LABEL: @test_widen_exp_v2(
+; TFA_INTERLEAVE-NEXT:  entry:
+; TFA_INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; TFA_INTERLEAVE-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; TFA_INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; TFA_INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 2, i64 [[TMP0]])
+; TFA_INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TFA_INTERLEAVE:       vector.body:
+; TFA_INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
+; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
+; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
+; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
+; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
+; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
+; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
+; TFA_INTERLEAVE-NEXT:    [[TMP13:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = xor <2 x i1> [[TMP13]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[PREDPHI3:%.*]] = select <2 x i1> [[TMP18]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TFA_INTERLEAVE:       pred.store.if:
+; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
+; TFA_INTERLEAVE-NEXT:    store double [[TMP20]], ptr [[P:%.*]], align 8
+; TFA_INTERLEAVE-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; TFA_INTERLEAVE:       pred.store.continue:
+; TFA_INTERLEAVE-NEXT:    [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; TFA_INTERLEAVE:       pred.store.if4:
+; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFA_INTERLEAVE-NEXT:    store double [[TMP22]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; TFA_INTERLEAVE:       pred.store.continue5:
+; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 0
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; TFA_INTERLEAVE:       pred.store.if6:
+; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0
+; TFA_INTERLEAVE-NEXT:    store double [[TMP24]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; TFA_INTERLEAVE:       pred.store.continue7:
+; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
+; TFA_INTERLEAVE:       pred.store.if8:
+; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1
+; TFA_INTERLEAVE-NEXT:    store double [[TMP26]], ptr [[P]], align 8
+; TFA_INTERLEAVE-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; TFA_INTERLEAVE:       pred.store.continue9:
+; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 2
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP27]], i64 [[TMP3]])
+; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP29]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TFA_INTERLEAVE:       end:
+; TFA_INTERLEAVE-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.end ]
+  %ld = load double, ptr %p2, align 8
+  %exp = tail call double @llvm.exp.f64(double %ld) #6
+  %cond1 = fcmp ogt double %exp, 0.000000e+00
+  br i1 %cond1, label %loop.middle, label %loop.end
+
+loop.middle:
+  br label %loop.end
+
+loop.end:
+  %sink = phi double [ 0.000000e+00, %loop.middle ], [ 1.000000e+00, %loop ]
+  store double %sink, ptr %p, align 8
+  %iv.next = add i64 %iv, 1
+  %cond2 = icmp eq i64 %iv, %n
+  br i1 %cond2, label %end, label %loop
+
+end:
+  ret void
+}
+
+
 declare i64 @foo(i64)
 declare double @llvm.fmuladd.f64(double, double, double)
+declare double @llvm.exp.f64(double)
+
+; fixed-width variants of exp
+declare <2 x double> @exp_fixed(<2 x double>)
 
-;; scalable vector variants of foo
+;; scalable vector variants of foo and exp
 declare <vscale x 2 x i64> @foo_uniform(i64, <vscale x 2 x i1>)
 declare <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64>, <vscale x 2 x i1>)
 declare <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64>)
+declare <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double>, <vscale x 2 x i1>)
 
 attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(foo_vector),_ZGVsMxu_foo(foo_uniform)" }
 attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(foo_vector)" }
 attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vector_nomask)" }
 attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vector_nomask),_ZGVsMxv_foo(foo_vector)" }
 attributes #4 = { "target-features"="+sve" vscale_range(2,16) "no-trapping-math"="false" }
+attributes #5 = { "target-cpu"="neoverse-v2" vscale_range(1,16) }
+attributes #6 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.exp.f64(exp_fixed),_ZGVsMxv_llvm.exp.f64(exp_masked_scalable)" }

From b5f41cc50c9db7e3c155e5c1e8114678baa6bce9 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 7 Feb 2025 10:16:32 +0000
Subject: [PATCH 049/282] [LoopVectorize] Fix build error (#126218)

Fixes issue caused by 1930524bbde3cd26ff527bbdb5e1f937f484edd6

Unused variable UsesMask in LoopVectorize.cpp

(cherry picked from commit 3872e55758a5de035c032a975f244302c3ddacc3)
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 318e4809d97ee..06c2a91f89b1c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6262,7 +6262,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 
       // Find the cost of vectorizing the call, if we can find a suitable
       // vector variant of the function.
-      bool UsesMask = false;
       VFInfo FuncInfo;
       Function *VecFunc = nullptr;
       // Search through any available variants for one we can use at this VF.
@@ -6314,7 +6313,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
             break;
           }
           case VFParamKind::GlobalPredicate:
-            UsesMask = true;
             break;
           default:
             ParamsOk = false;

From e2426cd9e9b47a771f2aaaef8c48f8dbea8b7d49 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 7 Feb 2025 09:18:18 +0100
Subject: [PATCH 050/282] [libclc] Allow default path when looking for
 llvm-spirv (#126071)

This is an external tool, so I don't think there is an expectation that
it has to be in the LLVM tools bindir. It may also be in the default
system bindir (which is not necessarily the same).

(cherry picked from commit 26ecddb05d13c101ccd840a6710eb5f8b82de841)
---
 libclc/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 2c2c7f16e2944..43e213b385f5d 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -114,7 +114,7 @@ endforeach()
 if( TARGET llvm-spirv )
   get_host_tool_path( llvm-spirv LLVM_SPIRV llvm-spirv_exe llvm-spirv_target )
 else()
-  find_program( LLVM_SPIRV llvm-spirv PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
+  find_program( LLVM_SPIRV llvm-spirv HINTS ${LLVM_TOOLS_BINARY_DIR} )
   set( llvm-spirv_exe "${LLVM_SPIRV}" )
   set( llvm-spirv_target )
 endif()

From 3c011c65f618a058b0be483223f4158bb269deea Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 7 Feb 2025 13:09:58 -0800
Subject: [PATCH 051/282] workflows/premerge: Move concurrency definition to
 workflow level (#126308)

Prior workflow runs were not being cancelled when the pull request was
closed, and I think this was why. Also, there is no advantage to having
the definitions at the job level.

(cherry picked from commit 6e5988863177e1d53e7a7abb7a3db2b85376f0f5)
---
 .github/workflows/premerge.yaml | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 45e6bb763a0ef..9b6a1236823d7 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -19,15 +19,16 @@ on:
       - 'main'
       - 'release/**'
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
 jobs:
   premerge-checks-linux:
     if: >-
         false && github.repository_owner == 'llvm' &&
         (github.event_name != 'pull_request' || github.event.action != 'closed')
     runs-on: llvm-premerge-linux-runners
-    concurrency:
-      group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }}
-      cancel-in-progress: true
     steps:
       - name: Checkout LLVM
         uses: actions/checkout@v4
@@ -86,9 +87,6 @@ jobs:
         false && github.repository_owner == 'llvm' &&
         (github.event_name != 'pull_request' || github.event.action != 'closed')
     runs-on: llvm-premerge-windows-runners
-    concurrency:
-      group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }}
-      cancel-in-progress: true
     defaults:
       run:
         shell: bash
@@ -146,9 +144,6 @@ jobs:
 
   permerge-check-macos:
     runs-on: macos-14
-    concurrency:
-      group: ${{ github.workflow }}-macos-${{ github.event.pull_request.number || github.sha }}
-      cancel-in-progress: true
     if: >-
       github.repository_owner == 'llvm' &&
       (startswith(github.ref_name, 'release/') ||

From 454d7c10f1869e0c109c733b2385b8c16c6a7756 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Thu, 6 Feb 2025 10:58:37 +0800
Subject: [PATCH 052/282] [X86] Do not combine LRINT and TRUNC (#125848)

Per to discussions in #125324, most participants are opposed to this
optimization. So remove the combination to address the concerns.

Fixes #125324

(cherry picked from commit 8c222c122f1a8edb1be96e482511ad547f7db7b3)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  5 -----
 llvm/test/CodeGen/X86/lrint-conv-i64.ll | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8f904209d8a3a..627cef9ead7ff 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53899,11 +53899,6 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
-  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
-  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
-      Src.hasOneUse())
-    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
-
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i64.ll b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
index 01b0af2f807f2..38fa09085e189 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
@@ -45,6 +45,24 @@ entry:
   ret i64 %0
 }
 
+define i32 @PR125324(float %x) {
+; SSE-LABEL: PR125324:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR125324:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.lrint.i64.f32(float %x)
+  %1 = trunc i64 %0 to i32
+  ret i32 %1
+}
+
 declare i64 @llvm.lrint.i64.f32(float) nounwind readnone
 declare i64 @llvm.lrint.i64.f64(double) nounwind readnone
 declare i64 @llvm.lrint.i64.f80(x86_fp80) nounwind readnone

From e40790864ba00124931f0b242c20a474a9255eb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= <kadircet@google.com>
Date: Thu, 6 Feb 2025 10:18:38 +0100
Subject: [PATCH 053/282] [clang] Parse warning-suppression-mapping after
 setting up diagengine (#125714)

We can emit diagnostics while parsing warning-suppression-mapping, make
sure command line flags take affect when emitting those.

(cherry picked from commit ecb016a87d89aed36b8f5d8102e15d8eb0e57108)
---
 clang/lib/Basic/Warnings.cpp             | 23 +++++++++++++----------
 clang/unittests/Basic/DiagnosticTest.cpp |  9 +++++++++
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Basic/Warnings.cpp b/clang/lib/Basic/Warnings.cpp
index da0304463007b..5f48e0ec81554 100644
--- a/clang/lib/Basic/Warnings.cpp
+++ b/clang/lib/Basic/Warnings.cpp
@@ -73,16 +73,6 @@ void clang::ProcessWarningOptions(DiagnosticsEngine &Diags,
   else
     Diags.setExtensionHandlingBehavior(diag::Severity::Ignored);
 
-  if (!Opts.DiagnosticSuppressionMappingsFile.empty()) {
-    if (auto FileContents =
-            VFS.getBufferForFile(Opts.DiagnosticSuppressionMappingsFile)) {
-      Diags.setDiagSuppressionMapping(**FileContents);
-    } else if (ReportDiags) {
-      Diags.Report(diag::err_drv_no_such_file)
-          << Opts.DiagnosticSuppressionMappingsFile;
-    }
-  }
-
   SmallVector<diag::kind, 10> _Diags;
   const IntrusiveRefCntPtr< DiagnosticIDs > DiagIDs =
     Diags.getDiagnosticIDs();
@@ -237,4 +227,17 @@ void clang::ProcessWarningOptions(DiagnosticsEngine &Diags,
       }
     }
   }
+
+  // Process suppression mappings file after processing other warning flags
+  // (like -Wno-unknown-warning-option) as we can emit extra warnings during
+  // processing.
+  if (!Opts.DiagnosticSuppressionMappingsFile.empty()) {
+    if (auto FileContents =
+            VFS.getBufferForFile(Opts.DiagnosticSuppressionMappingsFile)) {
+      Diags.setDiagSuppressionMapping(**FileContents);
+    } else if (ReportDiags) {
+      Diags.Report(diag::err_drv_no_such_file)
+          << Opts.DiagnosticSuppressionMappingsFile;
+    }
+  }
 }
diff --git a/clang/unittests/Basic/DiagnosticTest.cpp b/clang/unittests/Basic/DiagnosticTest.cpp
index e03d9a464df7f..e33840705e44a 100644
--- a/clang/unittests/Basic/DiagnosticTest.cpp
+++ b/clang/unittests/Basic/DiagnosticTest.cpp
@@ -346,4 +346,13 @@ TEST_F(SuppressionMappingTest, IsIgnored) {
   EXPECT_FALSE(Diags.isIgnored(diag::warn_unused_function,
                                SM.getLocForStartOfFile(ClangID)));
 }
+
+TEST_F(SuppressionMappingTest, ParsingRespectsOtherWarningOpts) {
+  Diags.getDiagnosticOptions().DiagnosticSuppressionMappingsFile = "foo.txt";
+  FS->addFile("foo.txt", /*ModificationTime=*/{},
+              llvm::MemoryBuffer::getMemBuffer("[non-existing-warning]"));
+  Diags.getDiagnosticOptions().Warnings.push_back("no-unknown-warning-option");
+  clang::ProcessWarningOptions(Diags, Diags.getDiagnosticOptions(), *FS);
+  EXPECT_THAT(diags(), IsEmpty());
+}
 } // namespace

From dbb2699d2eee998e2a3d7862f7d633736bbbd77b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 5 Feb 2025 20:51:28 -0600
Subject: [PATCH 054/282] [Offload] Stop the RPC server faiilng with more than
 one GPU (#125982)

Summary:
Pretty dumb mistake of me, forgot that this is run per-device and
per-plugin, which fell through the cracks with my testing because I have
two GPUs that use different plugins.

(cherry picked from commit 7a8779422dad058f11cd473d409f42e32859788d)
---
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 16f510de3ecc5..57672b0223bec 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1057,8 +1057,9 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
   if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
     return Err;
 
-  if (auto Err = Server.startThread())
-    return Err;
+  if (!Server.Thread->Running.load(std::memory_order_acquire))
+    if (auto Err = Server.startThread())
+      return Err;
 
   RPCServer = &Server;
   DP("Running an RPC server on device %d\n", getDeviceId());
@@ -1633,7 +1634,7 @@ Error GenericPluginTy::deinit() {
   if (GlobalHandler)
     delete GlobalHandler;
 
-  if (RPCServer && RPCServer->Thread->Running.load(std::memory_order_relaxed))
+  if (RPCServer && RPCServer->Thread->Running.load(std::memory_order_acquire))
     if (Error Err = RPCServer->shutDown())
       return Err;
 

From 4d3d4445c67c02d34900d5e4160632e1a06b8df8 Mon Sep 17 00:00:00 2001
From: Peter Smith <peter.smith@arm.com>
Date: Wed, 5 Feb 2025 19:04:21 +0000
Subject: [PATCH 055/282] [LLD][ELF][AArch64] Discard .ARM.attributes sections
 (#125838)

LLVM has started to emit AArch64 build attributes sections called
.ARM.attributes. LLD does not yet have support for these so they are
accumulating in the ELF output. As the first part of that support
discard all the .ARM.attributes sections. This can be built upon by the
full implementation in LLD.

The build attributes specification only defines build attributes for
relocatable objects. The intention for LLD is that files of type ET_EXEC
and ET_SHARED will not have a build attributes in the output. A
relocatable link with -r will need a merged build attributes, but until
the merge is implemented it is better to discard.

(cherry picked from commit ba476d0b83dc8a4bbf066dc02a0f73ded27114f0)
---
 lld/ELF/InputFiles.cpp                  |  7 +++++++
 lld/test/ELF/aarch64-build-attributes.s | 26 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 lld/test/ELF/aarch64-build-attributes.s

diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 42d0e4c202ec6..caee72cf31955 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -639,6 +639,13 @@ template <class ELFT> void ObjFile<ELFT>::parse(bool ignoreComdats) {
       }
       break;
     case EM_AARCH64:
+      // FIXME: BuildAttributes have been implemented in llvm, but not yet in
+      // lld. Remove the section so that it does not accumulate in the output
+      // file. When support is implemented we expect not to output a build
+      // attributes section in files of type ET_EXEC or ET_SHARED, but ld -r
+      // ouptut will need a single merged attributes section.
+      if (sec.sh_type == SHT_AARCH64_ATTRIBUTES)
+        sections[i] = &InputSection::discarded;
       // Producing a static binary with MTE globals is not currently supported,
       // remove all SHT_AARCH64_MEMTAG_GLOBALS_STATIC sections as they're unused
       // medatada, and we don't want them to end up in the output file for
diff --git a/lld/test/ELF/aarch64-build-attributes.s b/lld/test/ELF/aarch64-build-attributes.s
new file mode 100644
index 0000000000000..24e15f94e3d4a
--- /dev/null
+++ b/lld/test/ELF/aarch64-build-attributes.s
@@ -0,0 +1,26 @@
+// REQUIRES: aarch64
+// RUN: llvm-mc -triple=aarch64 %s -filetype=obj -o %t.o
+// RUN: ld.lld %t.o --shared -o %t.so
+// RUN: llvm-readelf --sections %t.so | FileCheck %s
+// RUN: ld.lld %t.o -o %t
+// RUN: llvm-readelf --sections %t | FileCheck %s
+// RUN: ld.lld -r %t.o -o %t2.o
+// RUN: llvm-readelf --sections %t2.o | FileCheck %s
+
+/// File has a Build attributes section. This should not appear in
+/// ET_EXEC or ET_SHARED files as there is no requirement for it to
+/// do so. FIXME, the ld -r (relocatable link) should output a single
+/// merged build attributes section. When full support is added in
+/// ld.lld this test should be updated.
+
+// CHECK-NOT: .ARM.attributes
+
+.aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+.aeabi_attribute Tag_Feature_BTI, 1
+.aeabi_attribute Tag_Feature_PAC, 1
+.aeabi_attribute Tag_Feature_GCS, 1
+
+.global _start
+.type _start, %function
+_start:
+ret

From de5dcad3a2d332e75ea3b680d7d41080b064ca7c Mon Sep 17 00:00:00 2001
From: CarolineConcatto <caroline.concatto@arm.com>
Date: Thu, 6 Feb 2025 17:13:30 +0000
Subject: [PATCH 056/282] [AArch64] Update feature dep. for Armv9.6 extensions
 (#125874)

These features FEAT_FAMINMAX, FEAT_LUT and FEAT_FP8 depends on
FEAT_NEON.

Update dependency from FEAT_FP8DOT4 and FEAT_FP8DOT2. Now depends
indirectly on FEAT_NEON through FEAT_FP8
---
 llvm/lib/Target/AArch64/AArch64Features.td     | 10 +++++-----
 llvm/lib/Target/AArch64/AArch64Processors.td   |  2 +-
 .../TargetParser/TargetParserTest.cpp          | 18 +++++++++++++-----
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 20db70ee38572..d47dcfe92ff19 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -473,13 +473,13 @@ def FeatureD128 : ExtensionWithMArch<"d128", "D128",
 //===----------------------------------------------------------------------===//
 
 def FeatureFAMINMAX: ExtensionWithMArch<"faminmax", "FAMINMAX", "FEAT_FAMINMAX",
- "Enable FAMIN and FAMAX instructions">;
+ "Enable FAMIN and FAMAX instructions", [FeatureNEON]>;
 
 def FeatureLUT: ExtensionWithMArch<"lut", "LUT", "FEAT_LUT",
- "Enable Lookup Table instructions">;
+ "Enable Lookup Table instructions", [FeatureNEON]>;
 
 def FeatureFP8 : ExtensionWithMArch<"fp8", "FP8", "FEAT_FP8",
-  "Enable FP8 instructions", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>;
+  "Enable FP8 instructions", [FeatureNEON]>;
 
 def FeatureFP8FMA : ExtensionWithMArch<"fp8fma", "FP8FMA", "FEAT_FP8FMA",
   "Enable Armv9.5-A FP8 multiply-add instructions", [FeatureFP8]>;
@@ -488,10 +488,10 @@ def FeatureSSVE_FP8FMA : ExtensionWithMArch<"ssve-fp8fma", "SSVE_FP8FMA", "FEAT_
   "Enable SVE2 FP8 multiply-add instructions", [FeatureSME2, FeatureFP8]>;
 
 def FeatureFP8DOT4: ExtensionWithMArch<"fp8dot4", "FP8DOT4", "FEAT_FP8DOT4",
-  "Enable FP8 4-way dot instructions", [FeatureNEON, FeatureFP8]>;
+  "Enable FP8 4-way dot instructions", [FeatureFP8]>;
 
 def FeatureFP8DOT2: ExtensionWithMArch<"fp8dot2", "FP8DOT2", "FEAT_FP8DOT2",
-  "Enable FP8 2-way dot instructions", [FeatureNEON, FeatureFP8]>;
+  "Enable FP8 2-way dot instructions", [FeatureFP8]>;
 
 def FeatureSSVE_FP8DOT4 : ExtensionWithMArch<"ssve-fp8dot4", "SSVE_FP8DOT4", "FEAT_SSVE_FP8DOT4",
   "Enable SVE2 FP8 4-way dot product instructions", [FeatureSME2, FeatureFP8]>;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 8a2c0442a0c0d..d1d4986d12550 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -868,7 +868,7 @@ def ProcessorFeatures {
                                    FeatureSSBS, FeatureLS64, FeatureCLRBHB,
                                    FeatureSPECRES2, FeatureSVEAES, FeatureSVE2SM4,
                                    FeatureSVE2SHA3, FeatureSVE2, FeatureSVEBitPerm, FeatureETE,
-                                   FeatureMEC, FeatureFP8DOT2];
+                                   FeatureMEC, FeatureFAMINMAX, FeatureFP8DOT2, FeatureLUT];
   list<SubtargetFeature> Carmel   = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES,
                                      FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM,
                                      FeatureFPARMv8];
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index c594d38b50b22..fdf0b0fa23744 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1814,7 +1814,7 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV9_6A, {"nofp", "fprcvt"}, {"fp-armv8", "fprcvt"}, {}},
         {AArch64::ARMV9_6A, {"fprcvt", "nofp"}, {}, {"fp-armv8", "fprcvt"}},
 
-        // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, fp8dot4, fp8dot2}
+        // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, faminmax, lut, fp8}
         {AArch64::ARMV8A, {"nosimd", "aes"}, {"neon", "aes"}, {}},
         {AArch64::ARMV8A, {"aes", "nosimd"}, {}, {"neon", "aes"}},
         {AArch64::ARMV8A, {"nosimd", "sha2"}, {"neon", "sha2"}, {}},
@@ -1827,10 +1827,18 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV9_6A, {"f8f16mm", "nosimd"}, {}, {"neon", "f8f16mm"}},
         {AArch64::ARMV9_6A, {"nosimd", "f8f32mm"}, {"neon", "f8f32mm"}, {}},
         {AArch64::ARMV9_6A, {"f8f32mm", "nosimd"}, {}, {"neon", "f8f32mm"}},
-        {AArch64::ARMV9_6A, {"nosimd", "fp8dot4"}, {"neon", "fp8dot4"}, {}},
-        {AArch64::ARMV9_6A, {"fp8dot4", "nosimd"}, {}, {"neon", "fp8dot4"}},
-        {AArch64::ARMV9_6A, {"nosimd", "fp8dot2"}, {"neon", "fp8dot2"}, {}},
-        {AArch64::ARMV9_6A, {"fp8dot2", "nosimd"}, {}, {"neon", "fp8dot2"}},
+        {AArch64::ARMV9_6A, {"faminmax", "nosimd"}, {}, {"neon", "faminmax"}},
+        {AArch64::ARMV9_6A, {"nosimd", "faminmax"}, {"neon", "faminmax"}, {}},
+        {AArch64::ARMV9_6A, {"lut", "nosimd"}, {}, {"neon", "lut"}},
+        {AArch64::ARMV9_6A, {"nosimd", "lut"}, {"neon", "lut"}, {}},
+        {AArch64::ARMV9_6A, {"fp8", "nosimd"}, {}, {"neon", "fp8"}},
+        {AArch64::ARMV9_6A, {"nosimd", "fp8"}, {"neon", "fp8"}, {}},
+
+        // fp8 -> {fp8dot4, fp8dot2}
+        {AArch64::ARMV9_6A, {"nofp8", "fp8dot4"}, {"fp8", "fp8dot4"}, {}},
+        {AArch64::ARMV9_6A, {"fp8dot4", "nofp8"}, {}, {"fp8", "fp8dot4"}},
+        {AArch64::ARMV9_6A, {"nofp8", "fp8dot2"}, {"fp8", "fp8dot2"}, {}},
+        {AArch64::ARMV9_6A, {"fp8dot2", "nofp8"}, {}, {"fp8", "fp8dot2"}},
 
         // simd -> {rdm, dotprod, fcma}
         {AArch64::ARMV8A, {"nosimd", "rdm"}, {"neon", "rdm"}, {}},

From 3351e1b142a8ef097f15766e363c2ecb5b8f7e5f Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 7 Feb 2025 10:16:57 +0000
Subject: [PATCH 057/282] [AArch64] Enable AvoidLDAPUR for cpu=generic between
 armv8.4 and armv9.3. (#125261)

As added in #124274, CPUs in this range can suffer from performance
issues with ldapur. As the gain from ldar->ldapr is expected to be
greater than the minor gain from ldapr->ldapur, this opts to avoid the
instruction under the default -mcpu=generic when the -march is less that
armv8.8 / armv9.3.

I renamed AArch64Subtarget::Others to AArch64Subtarget::Generic to be
clearer what it means.

(cherry picked from commit 6424abcd6c9c6aa8171c79d0fe0369d3a10da3d5)
---
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp               | 7 ++++++-
 llvm/lib/Target/AArch64/AArch64Subtarget.h                 | 4 ++--
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp     | 2 +-
 .../AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll       | 7 +++++--
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index bc921f07e1dbf..809e658e65380 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -125,7 +125,12 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
   // this in the future so we can specify it together with the subtarget
   // features.
   switch (ARMProcFamily) {
-  case Others:
+  case Generic:
+    // Using TuneCPU=generic we avoid ldapur instructions to line up with the
+    // cpus that use the AvoidLDAPUR feature. We don't want this to be on
+    // forever, so it is enabled between armv8.4 and armv8.7/armv9.2.
+    if (hasV8_4aOps() && !hasV8_8aOps())
+      AvoidLDAPUR = true;
     break;
   case Carmel:
     CacheLineSize = 64;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index d22991224d496..dca5f5393fe47 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -38,7 +38,7 @@ class Triple;
 class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 public:
   enum ARMProcFamilyEnum : uint8_t {
-    Others,
+    Generic,
 #define ARM_PROCESSOR_FAMILY(ENUM) ENUM,
 #include "llvm/TargetParser/AArch64TargetParserDef.inc"
 #undef ARM_PROCESSOR_FAMILY
@@ -46,7 +46,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 
 protected:
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
-  ARMProcFamilyEnum ARMProcFamily = Others;
+  ARMProcFamilyEnum ARMProcFamily = Generic;
 
   // Enable 64-bit vectorization in SLP.
   unsigned MinVectorRegisterBitWidth = 64;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4af3c482e6598..cd994d53a6008 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4272,7 +4272,7 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
   // checking for that case, we can ensure that the default behaviour is
   // unchanged
-  if (ST->getProcFamily() != AArch64Subtarget::Others &&
+  if (ST->getProcFamily() != AArch64Subtarget::Generic &&
       !ST->getSchedModel().isOutOfOrder()) {
     UP.Runtime = true;
     UP.Partial = true;
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll
index b475e68db411a..02ff12c27fcda 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll
@@ -1,12 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "(?!^\s*lda.*\bsp\b)^\s*.*\bsp\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=true -global-isel-abort=2 -O0 | FileCheck %s --check-prefixes=CHECK,GISEL
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.8a -mattr=+rcpc-immo -global-isel=true -global-isel-abort=2 -O0 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo,avoid-ldapur -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v2 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x4 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x925 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v9a -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.8a -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v9.3a -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR
 
 define i8 @load_atomic_i8_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i8_aligned_unordered:

From 898089b76e1c9f6bb9c61a4f43e287b07c67eadb Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Wed, 5 Feb 2025 17:49:52 +0000
Subject: [PATCH 058/282] [mlir][CMake] Fix dependency on MLIRTestDialect in
 Transforms tests (#125894)

Another follow up fix to
https://github.com/llvm/llvm-project/pull/123910 to fix a build failure
that sometimes happens in shared library builds:
https://lab.llvm.org/buildbot/#/builders/50/builds/9724

In file included from
/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/Transforms/TestInlining.cpp:16:
/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/Transforms/../Dialect/Test/TestOps.h:148:10:
fatal error: 'TestOps.h.inc' file not found
  148 | #include "TestOps.h.inc"
      |          ^~~~~~~~~~~~~~~
1 error generated.

(cherry picked from commit ebd23f25c8936db3dd917567737a067d6878e2f4)
---
 mlir/test/lib/Transforms/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index b91265d20fb48..1b9b9bffa5279 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -34,12 +34,14 @@ add_mlir_library(MLIRTestTransforms
 
   DEPENDS
   ${MLIRTestTransformsPDLDep}
+
+  LINK_LIBS PUBLIC
+  MLIRTestDialect
   )
 mlir_target_link_libraries(MLIRTestTransforms PUBLIC
   MLIRAnalysis
   MLIRFuncDialect
   MLIRInferIntRangeInterface
-  MLIRTestDialect
   MLIRTransforms
   )
 

From f92369f2591f986c69a5b189d22da64aceb5b33d Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 5 Feb 2025 22:32:51 -0500
Subject: [PATCH 059/282] [libc++] Replace __is_trivially_relocatable by
 is_trivially_copyable (#124970)

The __is_trivially_relocatable builtin has semantics that do not
correspond to any current or future notion of trivial relocation.
Furthermore, it currently leads to incorrect optimizations for some
types on supported compilers:
- Clang on Windows where types with non-trivial destructors get
  incorrectly optimized
- AppleClang where types with non-trivial move constructors get
  incorrectly optimized

Until there is an agreed upon and bugfree implementation of what it
means to be trivially relocatable, it is safer to simply use trivially
copyable instead. This doesn't leave a lot of types behind and is
definitely correct.

(cherry picked from commit accfbd4cb327411ad66c0109ba1841482b871967)
---
 .../__type_traits/is_trivially_relocatable.h  |  8 ++-
 .../is_trivially_relocatable.compile.pass.cpp | 14 ++--
 .../vector/trivial_relocation.pass.cpp        | 71 +++++++++++++++++++
 3 files changed, 86 insertions(+), 7 deletions(-)
 create mode 100644 libcxx/test/std/containers/sequences/vector/trivial_relocation.pass.cpp

diff --git a/libcxx/include/__type_traits/is_trivially_relocatable.h b/libcxx/include/__type_traits/is_trivially_relocatable.h
index c0871731cc001..9b0e240de55f4 100644
--- a/libcxx/include/__type_traits/is_trivially_relocatable.h
+++ b/libcxx/include/__type_traits/is_trivially_relocatable.h
@@ -11,7 +11,6 @@
 
 #include <__config>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_trivially_copyable.h>
 
@@ -23,8 +22,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // A type is trivially relocatable if a move construct + destroy of the original object is equivalent to
 // `memcpy(dst, src, sizeof(T))`.
-
-#if __has_builtin(__is_trivially_relocatable)
+//
+// Note that we don't use the __is_trivially_relocatable Clang builtin right now because it does not
+// implement the semantics of any current or future trivial relocation proposal and it can lead to
+// incorrect optimizations on some platforms (Windows) and supported compilers (AppleClang).
+#if __has_builtin(__is_trivially_relocatable) && 0
 template <class _Tp, class = void>
 struct __libcpp_is_trivially_relocatable : integral_constant<bool, __is_trivially_relocatable(_Tp)> {};
 #else
diff --git a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
index 674df1d021905..213d06d314a07 100644
--- a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
@@ -54,11 +54,17 @@ struct MoveOnlyTriviallyCopyable {
   MoveOnlyTriviallyCopyable(MoveOnlyTriviallyCopyable&&)                 = default;
   MoveOnlyTriviallyCopyable& operator=(MoveOnlyTriviallyCopyable&&)      = default;
 };
-#ifndef _MSC_VER
 static_assert(std::__libcpp_is_trivially_relocatable<MoveOnlyTriviallyCopyable>::value, "");
-#else
-static_assert(!std::__libcpp_is_trivially_relocatable<MoveOnlyTriviallyCopyable>::value, "");
-#endif
+
+struct NonTrivialMoveConstructor {
+  NonTrivialMoveConstructor(NonTrivialMoveConstructor&&);
+};
+static_assert(!std::__libcpp_is_trivially_relocatable<NonTrivialMoveConstructor>::value, "");
+
+struct NonTrivialDestructor {
+  ~NonTrivialDestructor() {}
+};
+static_assert(!std::__libcpp_is_trivially_relocatable<NonTrivialDestructor>::value, "");
 
 // library-internal types
 // ----------------------
diff --git a/libcxx/test/std/containers/sequences/vector/trivial_relocation.pass.cpp b/libcxx/test/std/containers/sequences/vector/trivial_relocation.pass.cpp
new file mode 100644
index 0000000000000..fbd597d07d6e3
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector/trivial_relocation.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// Make sure we don't miscompile vector operations for types that shouldn't be considered
+// trivially relocatable.
+
+#include <vector>
+#include <cassert>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+#include "test_macros.h"
+
+struct Tracker {
+  std::size_t move_constructs = 0;
+};
+
+struct [[clang::trivial_abi]] Inner {
+  TEST_CONSTEXPR_CXX20 explicit Inner(Tracker* tracker) : tracker_(tracker) {}
+  TEST_CONSTEXPR_CXX20 Inner(const Inner& rhs) : tracker_(rhs.tracker_) { tracker_->move_constructs += 1; }
+  TEST_CONSTEXPR_CXX20 Inner(Inner&& rhs) : tracker_(rhs.tracker_) { tracker_->move_constructs += 1; }
+  Tracker* tracker_;
+};
+
+// Even though this type contains a trivial_abi type, it is not trivially move-constructible,
+// so we should not attempt to optimize its move construction + destroy using trivial relocation.
+struct NotTriviallyMovable {
+  TEST_CONSTEXPR_CXX20 explicit NotTriviallyMovable(Tracker* tracker) : inner_(tracker) {}
+  TEST_CONSTEXPR_CXX20 NotTriviallyMovable(NotTriviallyMovable&& other) : inner_(std::move(other.inner_)) {}
+  Inner inner_;
+};
+static_assert(!std::is_trivially_copyable<NotTriviallyMovable>::value, "");
+LIBCPP_STATIC_ASSERT(!std::__libcpp_is_trivially_relocatable<NotTriviallyMovable>::value, "");
+
+TEST_CONSTEXPR_CXX20 bool tests() {
+  Tracker track;
+  std::vector<NotTriviallyMovable> v;
+
+  // Fill the vector at its capacity, such that any subsequent push_back would require growing.
+  v.reserve(5);
+  std::size_t const capacity = v.capacity(); // could technically be more than 5
+  while (v.size() < v.capacity()) {
+    v.emplace_back(&track);
+  }
+  assert(track.move_constructs == 0);
+  assert(v.capacity() == capacity);
+  assert(v.size() == capacity);
+
+  // Force a reallocation of the buffer + relocalization of the elements.
+  // All the existing elements of the vector should be move-constructed to their new location.
+  v.emplace_back(&track);
+  assert(track.move_constructs == capacity);
+
+  return true;
+}
+
+int main(int, char**) {
+  tests();
+#if TEST_STD_VER >= 20
+  static_assert(tests());
+#endif
+  return 0;
+}

From e5ea8f0a4855acd532db6beab8944a6226287a5f Mon Sep 17 00:00:00 2001
From: Tom Tromey <tom@tromey.com>
Date: Tue, 4 Feb 2025 14:36:22 -0700
Subject: [PATCH 060/282] Allow 128-bit discriminants in DWARF variants
 (#125578)

If a variant part has a 128-bit discriminator, then
DwarfUnit::constructTypeDIE will assert.  This patch fixes the problem
by allowing any size of integer to be used here.  This is largely
accomplished by moving part of DwarfUnit::addConstantValue to a new
method.

Fixes #119655

(cherry picked from commit 3c2807624d2006fa8aacf9c6441c9a3034a52b44)
---
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp     | 64 +++++++++++--------
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h       |  8 +++
 .../DebugInfo/Generic/discriminated-union.ll  |  4 +-
 3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d3450b8b0556f..5a17aa3bc9dc5 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -232,6 +232,42 @@ void DwarfUnit::addUInt(DIEValueList &Block, dwarf::Form Form,
   addUInt(Block, (dwarf::Attribute)0, Form, Integer);
 }
 
+void DwarfUnit::addIntAsBlock(DIE &Die, dwarf::Attribute Attribute, const APInt &Val) {
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
+
+  // Get the raw data form of the large APInt.
+  const uint64_t *Ptr64 = Val.getRawData();
+
+  int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
+  bool LittleEndian = Asm->getDataLayout().isLittleEndian();
+
+  // Output the constant to DWARF one byte at a time.
+  for (int i = 0; i < NumBytes; i++) {
+    uint8_t c;
+    if (LittleEndian)
+      c = Ptr64[i / 8] >> (8 * (i & 7));
+    else
+      c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
+    addUInt(*Block, dwarf::DW_FORM_data1, c);
+  }
+
+  addBlock(Die, Attribute, Block);
+}
+
+void DwarfUnit::addInt(DIE &Die, dwarf::Attribute Attribute,
+		       const APInt &Val, bool Unsigned) {
+  unsigned CIBitWidth = Val.getBitWidth();
+  if (CIBitWidth <= 64) {
+    if (Unsigned)
+      addUInt(Die, Attribute, std::nullopt, Val.getZExtValue());
+    else
+      addSInt(Die, Attribute, std::nullopt, Val.getSExtValue());
+    return;
+  }
+
+  addIntAsBlock(Die, Attribute, Val);
+}
+
 void DwarfUnit::addSInt(DIEValueList &Die, dwarf::Attribute Attribute,
                         std::optional<dwarf::Form> Form, int64_t Integer) {
   if (!Form)
@@ -484,25 +520,7 @@ void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) {
     return;
   }
 
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
-
-  // Get the raw data form of the large APInt.
-  const uint64_t *Ptr64 = Val.getRawData();
-
-  int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
-  bool LittleEndian = Asm->getDataLayout().isLittleEndian();
-
-  // Output the constant to DWARF one byte at a time.
-  for (int i = 0; i < NumBytes; i++) {
-    uint8_t c;
-    if (LittleEndian)
-      c = Ptr64[i / 8] >> (8 * (i & 7));
-    else
-      c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
-    addUInt(*Block, dwarf::DW_FORM_data1, c);
-  }
-
-  addBlock(Die, dwarf::DW_AT_const_value, Block);
+  addIntAsBlock(Die, dwarf::DW_AT_const_value, Val);
 }
 
 void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
@@ -980,12 +998,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
           DIE &Variant = createAndAddDIE(dwarf::DW_TAG_variant, Buffer);
           if (const ConstantInt *CI =
               dyn_cast_or_null<ConstantInt>(DDTy->getDiscriminantValue())) {
-            if (DD->isUnsignedDIType(Discriminator->getBaseType()))
-              addUInt(Variant, dwarf::DW_AT_discr_value, std::nullopt,
-                      CI->getZExtValue());
-            else
-              addSInt(Variant, dwarf::DW_AT_discr_value, std::nullopt,
-                      CI->getSExtValue());
+	    addInt(Variant, dwarf::DW_AT_discr_value, CI->getValue(),
+		   DD->isUnsignedDIType(Discriminator->getBaseType()));
           }
           constructMemberDIE(Variant, DDTy);
         } else {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 7a5295d826a48..842a8d6ad53b1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -167,6 +167,10 @@ class DwarfUnit : public DIEUnit {
 
   void addSInt(DIELoc &Die, std::optional<dwarf::Form> Form, int64_t Integer);
 
+  /// Add an integer attribute data and value; value may be any width.
+  void addInt(DIE &Die, dwarf::Attribute Attribute, const APInt &Integer,
+	      bool Unsigned);
+
   /// Add a string attribute data and value.
   ///
   /// We always emit a reference to the string pool instead of immediate
@@ -336,6 +340,10 @@ class DwarfUnit : public DIEUnit {
   void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
 
 private:
+  /// A helper to add a wide integer constant to a DIE using a block
+  /// form.
+  void addIntAsBlock(DIE &Die, dwarf::Attribute Attribute, const APInt &Val);
+
   void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
   void constructTypeDIE(DIE &Buffer, const DIStringType *BTy);
   void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
diff --git a/llvm/test/DebugInfo/Generic/discriminated-union.ll b/llvm/test/DebugInfo/Generic/discriminated-union.ll
index d267d9b029e95..592f2152ae682 100644
--- a/llvm/test/DebugInfo/Generic/discriminated-union.ll
+++ b/llvm/test/DebugInfo/Generic/discriminated-union.ll
@@ -22,7 +22,7 @@
 ;         CHECK: DW_AT_alignment
 ;         CHECK: DW_AT_data_member_location [DW_FORM_data1]	(0x00)
 ;     CHECK: DW_TAG_variant
-;       CHECK: DW_AT_discr_value [DW_FORM_data1]	(0x00)
+;       CHECK: DW_AT_discr_value [DW_FORM_block1]	(<0x10> 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 )
 ;       CHECK: DW_TAG_member
 ;         CHECK: DW_AT_type
 ;         CHECK: DW_AT_alignment
@@ -71,7 +71,7 @@ attributes #0 = { nounwind uwtable }
 !21 = !DIBasicType(name: "u8", size: 8, encoding: DW_ATE_unsigned)
 !22 = !DIDerivedType(tag: DW_TAG_member, name: "__1", scope: !18, file: !7, baseType: !23, size: 64, align: 64)
 !23 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&u8", baseType: !21, size: 64, align: 64)
-!24 = !DIDerivedType(tag: DW_TAG_member, scope: !14, file: !7, baseType: !25, size: 128, align: 64, extraData: i64 0)
+!24 = !DIDerivedType(tag: DW_TAG_member, scope: !14, file: !7, baseType: !25, size: 128, align: 64, extraData: i128 18446744073709551616)
 !25 = !DICompositeType(tag: DW_TAG_structure_type, name: "Nope", scope: !12, file: !7, size: 128, align: 64, elements: !4, identifier: "7ce1efff6b82281ab9ceb730566e7e20::Nope")
 !27 = !DIBasicType(name: "u64", size: 64, encoding: DW_ATE_unsigned)
 !28 = !DIExpression()

From 5e36383513a57704b513732b4b152b67933c37d9 Mon Sep 17 00:00:00 2001
From: beetrees <b@beetr.ee>
Date: Wed, 5 Feb 2025 17:20:11 +0000
Subject: [PATCH 061/282] Fix
 `llvm/test/DebugInfo/Generic/discriminated-union.ll` on big-endian targets
 (#125849)

Fixes the failure of the [Solaris/sparcv9
buildbot](https://lab.llvm.org/buildbot/#/builders/13/builds/5103)
caused by #125578.

cc @rorth @tromey @dwblaikie

(cherry picked from commit 34929853bc39d28943373b8a96371a7e81e98917)
---
 llvm/test/DebugInfo/Generic/discriminated-union.ll | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/test/DebugInfo/Generic/discriminated-union.ll b/llvm/test/DebugInfo/Generic/discriminated-union.ll
index 592f2152ae682..0267580eb0cbf 100644
--- a/llvm/test/DebugInfo/Generic/discriminated-union.ll
+++ b/llvm/test/DebugInfo/Generic/discriminated-union.ll
@@ -1,8 +1,8 @@
 ; RUN: %llc_dwarf -O0 -filetype=obj < %s > %t
-; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s
+; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s --check-prefix %if target-byteorder-big-endian %{ CHECK-BE %} %else %{ CHECK-LE %}
 
 ; RUN: %llc_dwarf --try-experimental-debuginfo-iterators -O0 -filetype=obj < %s > %t
-; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s
+; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s --check-prefix %if target-byteorder-big-endian %{ CHECK-BE %} %else %{ CHECK-LE %}
 
 ; Check for a variant part that has two members, one of which has a
 ; discriminant value.
@@ -22,7 +22,8 @@
 ;         CHECK: DW_AT_alignment
 ;         CHECK: DW_AT_data_member_location [DW_FORM_data1]	(0x00)
 ;     CHECK: DW_TAG_variant
-;       CHECK: DW_AT_discr_value [DW_FORM_block1]	(<0x10> 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 )
+;       CHECK-LE: DW_AT_discr_value [DW_FORM_block1]	(<0x10> 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 )
+;       CHECK-BE: DW_AT_discr_value [DW_FORM_block1]	(<0x10> 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 00 )
 ;       CHECK: DW_TAG_member
 ;         CHECK: DW_AT_type
 ;         CHECK: DW_AT_alignment

From b2b41544eefa71f97fad492100617aab90d024fb Mon Sep 17 00:00:00 2001
From: Michael Park <mcypark@gmail.com>
Date: Mon, 3 Feb 2025 11:22:02 -0800
Subject: [PATCH 062/282] [C++20][Modules][Serialization] Delay marking pending
 incomplete decl chains until the end of `finishPendingActions`. (#121245)

The call to `hasBody` inside `finishPendingActions` that bumps the `PendingIncompleteDeclChains`
size from `0` to `1`, and also sets the `LazyVal->LastGeneration` to `6` which matches
the `LazyVal->ExternalSource->getGeneration()` value of `6`. Later, the iterations over `redecls()`
(which calls `getNextRedeclaration`) is expected to trigger the reload, but it **does not** since
the generation numbers match.

The proposed solution is to perform the marking of incomplete decl chains at the end of `finishPendingActions`.
This way, **all** of the incomplete decls are marked incomplete as a post-condition of `finishPendingActions`.
It's also safe to delay this operation since any operation being done within `finishPendingActions` has
`NumCurrentElementsDeserializing == 1`, which means that any calls to `CompleteDeclChain` would simply
add to the `PendingIncompleteDeclChains` without doing anything anyway.

(cherry picked from commit a9e249f64e800fbb20a3b26c0cfb68c1a1aee5e1)
---
 clang/lib/Serialization/ASTReader.cpp | 25 ++++---
 clang/test/Modules/pr121245.cpp       | 93 +++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 clang/test/Modules/pr121245.cpp

diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index f524251c48ddd..24acd6e297e71 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -10186,12 +10186,12 @@ void ASTReader::visitTopLevelModuleMaps(
 }
 
 void ASTReader::finishPendingActions() {
-  while (
-      !PendingIdentifierInfos.empty() || !PendingDeducedFunctionTypes.empty() ||
-      !PendingDeducedVarTypes.empty() || !PendingIncompleteDeclChains.empty() ||
-      !PendingDeclChains.empty() || !PendingMacroIDs.empty() ||
-      !PendingDeclContextInfos.empty() || !PendingUpdateRecords.empty() ||
-      !PendingObjCExtensionIvarRedeclarations.empty()) {
+  while (!PendingIdentifierInfos.empty() ||
+         !PendingDeducedFunctionTypes.empty() ||
+         !PendingDeducedVarTypes.empty() || !PendingDeclChains.empty() ||
+         !PendingMacroIDs.empty() || !PendingDeclContextInfos.empty() ||
+         !PendingUpdateRecords.empty() ||
+         !PendingObjCExtensionIvarRedeclarations.empty()) {
     // If any identifiers with corresponding top-level declarations have
     // been loaded, load those declarations now.
     using TopLevelDeclsMap =
@@ -10239,13 +10239,6 @@ void ASTReader::finishPendingActions() {
     }
     PendingDeducedVarTypes.clear();
 
-    // For each decl chain that we wanted to complete while deserializing, mark
-    // it as "still needs to be completed".
-    for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) {
-      markIncompleteDeclChain(PendingIncompleteDeclChains[I]);
-    }
-    PendingIncompleteDeclChains.clear();
-
     // Load pending declaration chains.
     for (unsigned I = 0; I != PendingDeclChains.size(); ++I)
       loadPendingDeclChain(PendingDeclChains[I].first,
@@ -10483,6 +10476,12 @@ void ASTReader::finishPendingActions() {
   for (auto *ND : PendingMergedDefinitionsToDeduplicate)
     getContext().deduplicateMergedDefinitonsFor(ND);
   PendingMergedDefinitionsToDeduplicate.clear();
+
+  // For each decl chain that we wanted to complete while deserializing, mark
+  // it as "still needs to be completed".
+  for (Decl *D : PendingIncompleteDeclChains)
+    markIncompleteDeclChain(D);
+  PendingIncompleteDeclChains.clear();
 }
 
 void ASTReader::diagnoseOdrViolations() {
diff --git a/clang/test/Modules/pr121245.cpp b/clang/test/Modules/pr121245.cpp
new file mode 100644
index 0000000000000..0e276ad0e435d
--- /dev/null
+++ b/clang/test/Modules/pr121245.cpp
@@ -0,0 +1,93 @@
+// If this test fails, it should be investigated under Debug builds.
+// Before the PR, this test was encountering an `llvm_unreachable()`.
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+// RUN: cd %t
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-01.h \
+// RUN:  -fcxx-exceptions -o %t/hu-01.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-02.h \
+// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
+// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-02.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-03.h \
+// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
+// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-03.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-04.h \
+// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
+// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-04.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-05.h \
+// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
+// RUN:  -fmodule-file=%t/hu-03.pcm -fmodule-file=%t/hu-04.pcm \
+// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-05.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-obj %t/main.cpp \
+// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
+// RUN:  -fmodule-file=%t/hu-02.pcm -fmodule-file=%t/hu-05.pcm \
+// RUN:  -fmodule-file=%t/hu-04.pcm -fmodule-file=%t/hu-03.pcm \
+// RUN:  -fmodule-file=%t/hu-01.pcm
+
+//--- hu-01.h
+template <typename T>
+struct A {
+  A() {}
+  ~A() {}
+};
+
+template <typename T>
+struct EBO : T {
+  EBO() = default;
+};
+
+template <typename T>
+struct HT : EBO<A<T>> {};
+
+//--- hu-02.h
+import "hu-01.h";
+
+inline void f() {
+  HT<int>();
+}
+
+//--- hu-03.h
+import "hu-01.h";
+
+struct C {
+  C();
+
+  HT<long> _;
+};
+
+//--- hu-04.h
+import "hu-01.h";
+
+void g(HT<long> = {});
+
+//--- hu-05.h
+import "hu-03.h";
+import "hu-04.h";
+import "hu-01.h";
+
+struct B {
+  virtual ~B() = default;
+
+  virtual void f() {
+    HT<long>();
+  }
+};
+
+//--- main.cpp
+import "hu-02.h";
+import "hu-05.h";
+import "hu-03.h";
+
+int main() {
+  f();
+  C();
+  B();
+}

From b05e3a7d5e2285b545510f01ea7c599502e582d8 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 30 Jan 2025 16:03:00 +1100
Subject: [PATCH 063/282] [ORC] Fix file comment formatting. NFC.

(cherry picked from commit 7fb233f549dee0346332562de050ef2ab3654329)
---
 llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
index aae7369fc29c4..7f0a45941cf9b 100644
--- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
@@ -1,5 +1,4 @@
-//===------ ELFNixPlatform.cpp - Utilities for executing ELFNix in Orc
-//-----===//
+//===----- ELFNixPlatform.cpp - Utilities for executing ELFNix in Orc -----===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 4abac9fbfdcdf63c221ecdd83eadaa8f578b5d1e Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 3 Feb 2025 18:34:39 +1100
Subject: [PATCH 064/282] [ORC] Drop 'Info' from
 MachOCompactUnwindInfoSectionName.

Rename MachOCompactUnwindInfoSectionName to MachOCompactUnwindSectionName.

Background:

There are two related sections used for compact-unwind info processing:
__LD,__compact_unwind -- the input table stored in relocatable object formats,
and __TEXT,__unwind_info -- the compressed table produced by the linker and
consumed by libunwind. To keep the distinction clear we'll use *CompactUnwind*
for names that refer to the __LD,__compact_unwind input tables and *UnwindInfo*
for names that refer to the __TEXT,__unwind_info output tables. Dropping 'Info'
from the variable above clarifies which section it refers to.

(cherry picked from commit a1ff2d18466bc27d3dc9b8bba688454e2a1cf196)
---
 .../llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h        | 2 +-
 llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp                 | 3 +--
 llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp      | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
index b927dfbce992a..31d0ecca20805 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
@@ -25,7 +25,7 @@ namespace orc {
 extern StringRef MachODataCommonSectionName;
 extern StringRef MachODataDataSectionName;
 extern StringRef MachOEHFrameSectionName;
-extern StringRef MachOCompactUnwindInfoSectionName;
+extern StringRef MachOCompactUnwindSectionName;
 extern StringRef MachOCStringSectionName;
 extern StringRef MachOModInitFuncSectionName;
 extern StringRef MachOObjCCatListSectionName;
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 48d54190fafb6..9479a69d4f0ba 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -1278,8 +1278,7 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo(
   if (Section *EHFrameSec = G.findSectionByName(MachOEHFrameSectionName))
     ScanUnwindInfoSection(*EHFrameSec, US.DwarfSection);
 
-  if (Section *CUInfoSec =
-          G.findSectionByName(MachOCompactUnwindInfoSectionName))
+  if (Section *CUInfoSec = G.findSectionByName(MachOCompactUnwindSectionName))
     ScanUnwindInfoSection(*CUInfoSec, US.CompactUnwindSection);
 
   // If we didn't find any pointed-to code-blocks then there's no need to
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
index 11e8eb7bc3a19..be92acd37aa8d 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
@@ -18,7 +18,7 @@ namespace orc {
 StringRef MachODataCommonSectionName = "__DATA,__common";
 StringRef MachODataDataSectionName = "__DATA,__data";
 StringRef MachOEHFrameSectionName = "__TEXT,__eh_frame";
-StringRef MachOCompactUnwindInfoSectionName = "__TEXT,__unwind_info";
+StringRef MachOCompactUnwindSectionName = "__TEXT,__unwind_info";
 StringRef MachOCStringSectionName = "__TEXT,__cstring";
 StringRef MachOModInitFuncSectionName = "__DATA,__mod_init_func";
 StringRef MachOObjCCatListSectionName = "__DATA,__objc_catlist";

From 124480368687dd518254b482707988fe152429ac Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 4 Feb 2025 16:17:53 +1100
Subject: [PATCH 065/282] [ORC] Add minimal-throw-catch.ll regression test for
 lli -jit-mode=orc.

We already had a -jit-mode=orc-lazy regression test for this, but it should
work equally well in non-lazy mode.

(cherry picked from commit b46211bbf683b30b88e41a684633fc63436e5edf)
---
 .../Orc/minimal-throw-catch.ll                | 52 +++++++++++++++++++
 .../OrcLazy/minimal-throw-catch.ll            |  4 --
 2 files changed, 52 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll

diff --git a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
new file mode 100644
index 0000000000000..cd22ec65ed999
--- /dev/null
+++ b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
@@ -0,0 +1,52 @@
+; REQUIRES: x86_64-apple
+; RUN: lli -jit-kind=orc-lazy %s
+;
+; Basic correctness testing for eh-frame processing and registration.
+
+@_ZTIi = external constant ptr
+
+declare ptr @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(ptr, ptr, ptr)
+
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(ptr)
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+
+define void @explode() {
+entry:
+  %exception = tail call ptr @__cxa_allocate_exception(i64 4)
+  store i32 42, ptr %exception, align 16
+  tail call void @__cxa_throw(ptr %exception, ptr @_ZTIi, ptr null)
+  unreachable
+}
+
+define i32 @main(i32 %argc, ptr %argv) personality ptr @__gxx_personality_v0 {
+entry:
+  invoke void @explode()
+          to label %return unwind label %lpad
+
+lpad:
+  %0 = landingpad { ptr, i32 }
+          catch ptr @_ZTIi
+  %1 = extractvalue { ptr, i32 } %0, 1
+  %2 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %matches = icmp eq i32 %1, %2
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:
+  %3 = extractvalue { ptr, i32 } %0, 0
+  %4 = tail call ptr @__cxa_begin_catch(ptr %3)
+  %5 = load i32, ptr %4, align 4
+  %cmp = icmp ne i32 %5, 42
+  %cond = zext i1 %cmp to i32
+  tail call void @__cxa_end_catch()
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %cond, %catch ], [ 2, %entry ]
+  ret i32 %retval.0
+
+eh.resume:
+  resume { ptr, i32 } %0
+}
diff --git a/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
index 5bc5769a2c0d2..cd22ec65ed999 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
@@ -3,10 +3,6 @@
 ;
 ; Basic correctness testing for eh-frame processing and registration.
 
-source_filename = "minimal-throw-catch.cpp"
-target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.14.0"
-
 @_ZTIi = external constant ptr
 
 declare ptr @__cxa_allocate_exception(i64)

From 3a474e70939a4c68a9598ad15c268621f6c33911 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 4 Feb 2025 17:38:16 +1100
Subject: [PATCH 066/282] [ORC] Actually use -jit-kind=orc for the new
 minimal-throw-catch.ll test.

b46211bbf68, which introduced a new copy of the minimal-throw-catch.ll test,
failed to update the run line.

(cherry picked from commit c0f7ebe715dbe706224389a3022e6a3880fef0a1)
---
 llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
index cd22ec65ed999..1b8f45184833f 100644
--- a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
+++ b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: x86_64-apple
-; RUN: lli -jit-kind=orc-lazy %s
+; RUN: lli -jit-kind=orc %s
 ;
 ; Basic correctness testing for eh-frame processing and registration.
 

From c5f8508dd3649fffdff2cfcae9335612f38abea9 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 4 Feb 2025 18:09:07 +1100
Subject: [PATCH 067/282] [ORC] Rename MachOCompactUnwindSectionName to
 MachOUnwindInfoSectionName.

a1ff2d18466 should have disambiguated MachOCompactUnwindInfoSectionName to
MachOUnwindInfoSectionName, given how it's used in MachOPlatform and its
value.

A MachOCompactUnwindSectionName variable with an appropriate value will be
added in an upcoming patch to re-enable compact-unwind support in JITLink.

(cherry picked from commit b84ac58dce65ea94994c24f40a14208c47f8119f)
---
 .../include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h | 2 +-
 llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp                  | 2 +-
 llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
index 31d0ecca20805..5de85a4f8674e 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
@@ -25,7 +25,6 @@ namespace orc {
 extern StringRef MachODataCommonSectionName;
 extern StringRef MachODataDataSectionName;
 extern StringRef MachOEHFrameSectionName;
-extern StringRef MachOCompactUnwindSectionName;
 extern StringRef MachOCStringSectionName;
 extern StringRef MachOModInitFuncSectionName;
 extern StringRef MachOObjCCatListSectionName;
@@ -53,6 +52,7 @@ extern StringRef MachOTextTextSectionName;
 extern StringRef MachOThreadBSSSectionName;
 extern StringRef MachOThreadDataSectionName;
 extern StringRef MachOThreadVarsSectionName;
+extern StringRef MachOUnwindInfoSectionName;
 
 extern StringRef MachOInitSectionNames[22];
 
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 9479a69d4f0ba..43d38c81fb8a6 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -1278,7 +1278,7 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo(
   if (Section *EHFrameSec = G.findSectionByName(MachOEHFrameSectionName))
     ScanUnwindInfoSection(*EHFrameSec, US.DwarfSection);
 
-  if (Section *CUInfoSec = G.findSectionByName(MachOCompactUnwindSectionName))
+  if (Section *CUInfoSec = G.findSectionByName(MachOUnwindInfoSectionName))
     ScanUnwindInfoSection(*CUInfoSec, US.CompactUnwindSection);
 
   // If we didn't find any pointed-to code-blocks then there's no need to
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
index be92acd37aa8d..d94acf2768817 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
@@ -18,7 +18,6 @@ namespace orc {
 StringRef MachODataCommonSectionName = "__DATA,__common";
 StringRef MachODataDataSectionName = "__DATA,__data";
 StringRef MachOEHFrameSectionName = "__TEXT,__eh_frame";
-StringRef MachOCompactUnwindSectionName = "__TEXT,__unwind_info";
 StringRef MachOCStringSectionName = "__TEXT,__cstring";
 StringRef MachOModInitFuncSectionName = "__DATA,__mod_init_func";
 StringRef MachOObjCCatListSectionName = "__DATA,__objc_catlist";
@@ -46,6 +45,7 @@ StringRef MachOTextTextSectionName = "__TEXT,__text";
 StringRef MachOThreadBSSSectionName = "__DATA,__thread_bss";
 StringRef MachOThreadDataSectionName = "__DATA,__thread_data";
 StringRef MachOThreadVarsSectionName = "__DATA,__thread_vars";
+StringRef MachOUnwindInfoSectionName = "__TEXT,__unwind_info";
 
 StringRef MachOInitSectionNames[22] = {
     MachOModInitFuncSectionName,         MachOObjCCatListSectionName,

From 1bcbc68e98b07cdef6b03377bb7cfc0d53787577 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 4 Feb 2025 22:34:38 +1100
Subject: [PATCH 068/282] [ORC] Fix eh-frame record target finding in
 MachOPlatform.

Unwind-info records only have one keep-alive edge to their target function, but
eh-frame records may have multiple edges (to the CIE, function, personality, and
lsda). We need to identify the target-function edge differently for each section
type.

(cherry picked from commit 52b5e3638a39e977bebb491312a6f7c53314efec)
---
 .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 57 ++++++++++++++-----
 1 file changed, 44 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 43d38c81fb8a6..cea0e984718f2 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -1256,7 +1256,8 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo(
   // ScanSection records a section range and adds any executable blocks that
   // that section points to to the CodeBlocks vector.
   SmallVector<Block *> CodeBlocks;
-  auto ScanUnwindInfoSection = [&](Section &Sec, ExecutorAddrRange &SecRange) {
+  auto ScanUnwindInfoSection = [&](Section &Sec, ExecutorAddrRange &SecRange,
+                                   auto GetCodeForRecord) {
     if (Sec.blocks().empty())
       return;
     SecRange = (*Sec.blocks().begin())->getRange();
@@ -1264,22 +1265,52 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo(
       auto R = B->getRange();
       SecRange.Start = std::min(SecRange.Start, R.Start);
       SecRange.End = std::max(SecRange.End, R.End);
-      for (auto &E : B->edges()) {
-        if (E.getKind() != Edge::KeepAlive || !E.getTarget().isDefined())
-          continue;
-        auto &TargetBlock = E.getTarget().getBlock();
-        auto &TargetSection = TargetBlock.getSection();
-        if ((TargetSection.getMemProt() & MemProt::Exec) == MemProt::Exec)
-          CodeBlocks.push_back(&TargetBlock);
-      }
+      if (auto *CodeBlock = GetCodeForRecord(*B))
+        CodeBlocks.push_back(CodeBlock);
     }
   };
 
-  if (Section *EHFrameSec = G.findSectionByName(MachOEHFrameSectionName))
-    ScanUnwindInfoSection(*EHFrameSec, US.DwarfSection);
+  if (Section *EHFrameSec = G.findSectionByName(MachOEHFrameSectionName)) {
+    ScanUnwindInfoSection(
+        *EHFrameSec, US.DwarfSection, [&](Block &B) -> Block * {
+          // Filter out CIE, personality, etc. edges.
+          SmallVector<Edge *, 4> BEdges;
+          for (auto &E : B.edges())
+            BEdges.push_back(&E);
+          llvm::sort(BEdges, [](const Edge *LHS, const Edge *RHS) {
+            return LHS->getOffset() < RHS->getOffset();
+          });
+          if (BEdges.size() < 2)
+            return nullptr;
+          auto &TargetBlock = BEdges[1]->getTarget().getBlock();
+#ifndef NDEBUG
+          auto &TargetSection = TargetBlock.getSection();
+          assert(&TargetSection != EHFrameSec &&
+                 (TargetSection.getMemProt() & MemProt::Exec) ==
+                     MemProt::Exec &&
+                 "Invalid eh-frame function target");
+#endif // NDEBUG
+          return &TargetBlock;
+        });
+  }
 
-  if (Section *CUInfoSec = G.findSectionByName(MachOUnwindInfoSectionName))
-    ScanUnwindInfoSection(*CUInfoSec, US.CompactUnwindSection);
+  if (Section *CUInfoSec = G.findSectionByName(MachOUnwindInfoSectionName)) {
+    ScanUnwindInfoSection(
+        *CUInfoSec, US.CompactUnwindSection, [&](Block &B) -> Block * {
+          // Compact unwind records should just have a keep-alive pointing to
+          // the target function.
+          assert(B.edges_size() == 1 &&
+                 "unwind-info record should only have one edge");
+          for (auto &E : B.edges()) {
+            assert(E.getTarget().isDefined() &&
+                   "unwind-info record edge has external target");
+            assert(E.getKind() == Edge::KeepAlive &&
+                   "unwind-info record has unexpected edge kind");
+            return &E.getTarget().getBlock();
+          }
+          return nullptr;
+        });
+  }
 
   // If we didn't find any pointed-to code-blocks then there's no need to
   // register any info.

From 74d53f73bdf81bf72b949e41058398d610612e6b Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 5 Feb 2025 13:25:50 +1100
Subject: [PATCH 069/282] [ORC] Moch MachOPlatform unwind-info fixes.

Some eh-frame records are CIEs, which don't point to functions. We need to skip
these records. This patch reuses EHFrameCFIBlockInspector to identify function
targets, rather than a custom loop. Any performance impact will be minimal, and
essentially irrelevant once compact-unwind support re-lands (since at that
point we'll discard most eh-frame records).

For unwind-info sections: don't assume one block per record: the unwind-info
section packs all records into a single block.

(cherry picked from commit 9de581b206eceac331aa26e13b62a9a35bfd406f)
---
 .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 42 +++++--------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index cea0e984718f2..44ae73671f5b1 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -9,6 +9,7 @@
 #include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
 
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
 #include "llvm/ExecutionEngine/JITLink/MachO.h"
 #include "llvm/ExecutionEngine/JITLink/aarch64.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
@@ -1257,7 +1258,7 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo(
   // that section points to to the CodeBlocks vector.
   SmallVector<Block *> CodeBlocks;
   auto ScanUnwindInfoSection = [&](Section &Sec, ExecutorAddrRange &SecRange,
-                                   auto GetCodeForRecord) {
+                                   auto AddCodeBlocks) {
     if (Sec.blocks().empty())
       return;
     SecRange = (*Sec.blocks().begin())->getRange();
@@ -1265,50 +1266,29 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo(
       auto R = B->getRange();
       SecRange.Start = std::min(SecRange.Start, R.Start);
       SecRange.End = std::max(SecRange.End, R.End);
-      if (auto *CodeBlock = GetCodeForRecord(*B))
-        CodeBlocks.push_back(CodeBlock);
+      AddCodeBlocks(*B);
     }
   };
 
   if (Section *EHFrameSec = G.findSectionByName(MachOEHFrameSectionName)) {
-    ScanUnwindInfoSection(
-        *EHFrameSec, US.DwarfSection, [&](Block &B) -> Block * {
-          // Filter out CIE, personality, etc. edges.
-          SmallVector<Edge *, 4> BEdges;
-          for (auto &E : B.edges())
-            BEdges.push_back(&E);
-          llvm::sort(BEdges, [](const Edge *LHS, const Edge *RHS) {
-            return LHS->getOffset() < RHS->getOffset();
-          });
-          if (BEdges.size() < 2)
-            return nullptr;
-          auto &TargetBlock = BEdges[1]->getTarget().getBlock();
-#ifndef NDEBUG
-          auto &TargetSection = TargetBlock.getSection();
-          assert(&TargetSection != EHFrameSec &&
-                 (TargetSection.getMemProt() & MemProt::Exec) ==
-                     MemProt::Exec &&
-                 "Invalid eh-frame function target");
-#endif // NDEBUG
-          return &TargetBlock;
-        });
+    ScanUnwindInfoSection(*EHFrameSec, US.DwarfSection, [&](Block &B) {
+      if (auto *Fn = jitlink::EHFrameCFIBlockInspector::FromEdgeScan(B)
+                         .getPCBeginEdge())
+        if (Fn->getTarget().isDefined())
+          CodeBlocks.push_back(&Fn->getTarget().getBlock());
+    });
   }
 
   if (Section *CUInfoSec = G.findSectionByName(MachOUnwindInfoSectionName)) {
     ScanUnwindInfoSection(
-        *CUInfoSec, US.CompactUnwindSection, [&](Block &B) -> Block * {
-          // Compact unwind records should just have a keep-alive pointing to
-          // the target function.
-          assert(B.edges_size() == 1 &&
-                 "unwind-info record should only have one edge");
+        *CUInfoSec, US.CompactUnwindSection, [&](Block &B) {
           for (auto &E : B.edges()) {
             assert(E.getTarget().isDefined() &&
                    "unwind-info record edge has external target");
             assert(E.getKind() == Edge::KeepAlive &&
                    "unwind-info record has unexpected edge kind");
-            return &E.getTarget().getBlock();
+            CodeBlocks.push_back(&E.getTarget().getBlock());
           }
-          return nullptr;
         });
   }
 

From 50c4c2bf6802da5eb47606de1cea89ddc60bf14b Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 4 Feb 2025 13:21:33 +1100
Subject: [PATCH 070/282] Re-reapply "[ORC] Enable JIT support for the
 compact-unwind..." with fixes.

Re-enables compact-unwind support in JITLink, which was reverted in b04847b427d
due to buildbot failures.

The underlying cause for the failures on the buildbots was the lack of
compact-unwind registration support on older Darwin OSes. Since the
CompactUnwindManager pass now removes eh-frames by default we were left with
unwind-info that could not be registered. On x86-64, where eh-frame info is
produced by default the solution is to fall back to using eh-frames. On arm64
we simply can't support exceptions on older OSes.

This patch updates the EHFrameRegistrationPlugin to remove the compact-unwind
section (__LD,__compact_unwind) when installed, forcing use of eh-frames when
the EHFrameRegistrationPlugin is used. In LLJIT, the EHFrameRegistrationPlugin
continues to be used for all non-Darwin platform, and will be added on Darwin
platforms when the a CompactUnwindRegistrationPlugin instance can't be created
(e.g. due to missing support for compact-unwind info registration).

The lit.cfg.py script is updated to check whether the host OSes default unwind
info supports JIT registration, allowing tests to be disabled for older Darwin
OSes on arm64.

(cherry picked from commit eae6d6d18bd4d9e7dfe5fc1206d23d8ef663c8c7)
---
 clang/test/Interpreter/simple-exception.cpp   |   4 +-
 compiler-rt/lib/orc/macho_platform.cpp        |   6 +
 .../TestCases/Darwin/Generic/exceptions.cpp   |  13 +
 llvm/include/llvm/ExecutionEngine/Orc/Core.h  |   9 +-
 .../Orc/ExecutorProcessControl.h              |   4 +
 .../Orc/Shared/MachOObjectFormat.h            |   1 +
 .../ExecutionEngine/Orc/Shared/OrcRTBridge.h  |   9 +
 .../Orc/TargetProcess/UnwindInfoManager.h     |  78 +++
 .../Orc/UnwindInfoRegistrationPlugin.h        |  70 ++
 .../ExecutionEngine/JITLink/CMakeLists.txt    |   1 +
 .../JITLink/CompactUnwindSupport.cpp          | 103 +++
 .../JITLink/CompactUnwindSupport.h            | 653 ++++++++++++++++++
 .../JITLink/MachOLinkGraphBuilder.cpp         | 116 ----
 .../JITLink/MachOLinkGraphBuilder.h           |  11 -
 .../ExecutionEngine/JITLink/MachO_arm64.cpp   |  54 +-
 .../ExecutionEngine/JITLink/MachO_x86_64.cpp  |  63 +-
 llvm/lib/ExecutionEngine/Orc/CMakeLists.txt   |   1 +
 llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp |   3 +
 llvm/lib/ExecutionEngine/Orc/Core.cpp         |  34 +-
 .../Orc/EHFrameRegistrationPlugin.cpp         |  13 +-
 .../Orc/ExecutorProcessControl.cpp            |   7 +
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp        |  31 +-
 .../Orc/Shared/MachOObjectFormat.cpp          |   1 +
 .../Orc/Shared/OrcRTBridge.cpp                |  14 +
 .../Orc/TargetProcess/CMakeLists.txt          |   1 +
 .../Orc/TargetProcess/UnwindInfoManager.cpp   | 188 +++++
 .../Orc/UnwindInfoRegistrationPlugin.cpp      | 238 +++++++
 .../Orc/minimal-throw-catch.ll                |   2 +-
 .../OrcLazy/minimal-throw-catch.ll            |   2 +-
 llvm/test/lit.cfg.py                          |  48 ++
 30 files changed, 1606 insertions(+), 172 deletions(-)
 create mode 100644 compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp
 create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h
 create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h
 create mode 100644 llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp
 create mode 100644 llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
 create mode 100644 llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
 create mode 100644 llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp

diff --git a/clang/test/Interpreter/simple-exception.cpp b/clang/test/Interpreter/simple-exception.cpp
index 6749acd6e6bd2..651e8d9402f89 100644
--- a/clang/test/Interpreter/simple-exception.cpp
+++ b/clang/test/Interpreter/simple-exception.cpp
@@ -1,7 +1,7 @@
 // clang-format off
 // UNSUPPORTED: system-aix
-// XFAIL for arm and arm64, or running on Windows.
-// XFAIL: target=arm{{.*}}, system-windows
+// XFAIL for arm, or running on Windows.
+// XFAIL: target=arm-{{.*}}, target=armv{{.*}}, system-windows
 // RUN: cat %s | clang-repl | FileCheck %s
 
 // Incompatible with msan. It passes with -O3 but fail -Oz. Interpreter
diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp
index 8ca68587aeb36..4b603fd95e316 100644
--- a/compiler-rt/lib/orc/macho_platform.cpp
+++ b/compiler-rt/lib/orc/macho_platform.cpp
@@ -557,6 +557,12 @@ Error MachOPlatformRuntimeState::registerObjectPlatformSections(
     return make_error<StringError>(ErrStream.str());
   }
 
+  ORC_RT_DEBUG({
+    printdbg("  UnwindInfo: %s, UseCallbackStyleUnwindInfo: %s\n",
+             UnwindInfo ? "true" : "false",
+             UseCallbackStyleUnwindInfo ? "true" : "false");
+  });
+
   if (UnwindInfo && UseCallbackStyleUnwindInfo) {
     ORC_RT_DEBUG({
       printdbg("  Registering new-style unwind info for:\n"
diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp
new file mode 100644
index 0000000000000..7e9c40c724aec
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp
@@ -0,0 +1,13 @@
+// RUN: %clangxx -c -o %t %s
+// RUN: %llvm_jitlink -slab-allocate=20Mb %t
+//
+// REQUIRES: system-darwin && host-arch-compatible
+
+int main(int argc, char *argv[]) {
+  try {
+    throw 42;
+  } catch (int E) {
+    return 42 - E;
+  }
+  return 1;
+}
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
index db853362f6573..3eddaf4c9c59f 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -1204,8 +1204,13 @@ class JITDylib : public ThreadSafeRefCountedBase<JITDylib>,
 
   JITDylib(ExecutionSession &ES, std::string Name);
 
-  std::pair<AsynchronousSymbolQuerySet, std::shared_ptr<SymbolDependenceMap>>
-  IL_removeTracker(ResourceTracker &RT);
+  struct RemoveTrackerResult {
+    AsynchronousSymbolQuerySet QueriesToFail;
+    std::shared_ptr<SymbolDependenceMap> FailedSymbols;
+    std::vector<std::unique_ptr<MaterializationUnit>> DefunctMUs;
+  };
+
+  RemoveTrackerResult IL_removeTracker(ResourceTracker &RT);
 
   void transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT);
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
index dcf5592f1717c..86e98e74b7055 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
@@ -20,6 +20,7 @@
 #include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
 #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
 #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h"
 #include "llvm/ExecutionEngine/Orc/TaskDispatch.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/MSVCErrorWorkarounds.h"
@@ -507,6 +508,9 @@ class SelfExecutorProcessControl : public ExecutorProcessControl,
                           SymbolLookupCompleteFn F) override;
 
   std::unique_ptr<jitlink::JITLinkMemoryManager> OwnedMemMgr;
+#ifdef __APPLE__
+  std::unique_ptr<UnwindInfoManager> UnwindInfoMgr;
+#endif // __APPLE__
   char GlobalManglingPrefix = 0;
 };
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
index 5de85a4f8674e..8eb8c1b4cc89d 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h
@@ -25,6 +25,7 @@ namespace orc {
 extern StringRef MachODataCommonSectionName;
 extern StringRef MachODataDataSectionName;
 extern StringRef MachOEHFrameSectionName;
+extern StringRef MachOCompactUnwindSectionName;
 extern StringRef MachOCStringSectionName;
 extern StringRef MachOModInitFuncSectionName;
 extern StringRef MachOObjCCatListSectionName;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
index aed43f6308cba..db5ff135a7164 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
@@ -88,6 +88,15 @@ using SPSRunAsMainSignature = int64_t(shared::SPSExecutorAddr,
 using SPSRunAsVoidFunctionSignature = int32_t(shared::SPSExecutorAddr);
 using SPSRunAsIntFunctionSignature = int32_t(shared::SPSExecutorAddr, int32_t);
 } // end namespace rt
+
+namespace rt_alt {
+extern const char *UnwindInfoManagerInstanceName;
+extern const char *UnwindInfoManagerFindSectionsHelperName;
+extern const char *UnwindInfoManagerEnableWrapperName;
+extern const char *UnwindInfoManagerDisableWrapperName;
+extern const char *UnwindInfoManagerRegisterActionName;
+extern const char *UnwindInfoManagerDeregisterActionName;
+} // end namespace rt_alt
 } // end namespace orc
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h
new file mode 100644
index 0000000000000..fc7719f282122
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h
@@ -0,0 +1,78 @@
+//===--- UnwindInfoManager.h -- Register unwind info sections ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utilities for managing eh-frame and compact-unwind registration and lookup
+// through libunwind's find_dynamic_unwind_sections mechanism.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h"
+#include "llvm/Support/Error.h"
+#include <map>
+#include <mutex>
+
+namespace llvm::orc {
+
+class UnwindInfoManager : public ExecutorBootstrapService {
+public:
+  // This struct's layout should match the unw_dynamic_unwind_sections struct
+  // from libunwind/src/libunwid_ext.h.
+  struct UnwindSections {
+    uintptr_t dso_base;
+    uintptr_t dwarf_section;
+    size_t dwarf_section_length;
+    uintptr_t compact_unwind_section;
+    size_t compact_unwind_section_length;
+  };
+
+  /// If the libunwind find-dynamic-unwind-info callback registration APIs are
+  /// available then this method will return an UnwindInfoManager instance,
+  /// otherwise it will return nullptr.
+  static std::unique_ptr<UnwindInfoManager> TryCreate();
+
+  Error shutdown() override;
+  void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override;
+
+  Error enable(void *FindDynamicUnwindSections);
+  Error disable(void);
+
+  Error registerSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges,
+                         orc::ExecutorAddr DSOBase,
+                         orc::ExecutorAddrRange DWARFEHFrame,
+                         orc::ExecutorAddrRange CompactUnwind);
+
+  Error deregisterSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges);
+
+  int findSections(uintptr_t Addr, UnwindSections *Info);
+
+private:
+  UnwindInfoManager(int (*AddFindDynamicUnwindSections)(void *),
+                    int (*RemoveFindDynamicUnwindSections)(void *))
+      : AddFindDynamicUnwindSections(AddFindDynamicUnwindSections),
+        RemoveFindDynamicUnwindSections(RemoveFindDynamicUnwindSections) {}
+
+  static int findSectionsHelper(UnwindInfoManager *Instance, uintptr_t Addr,
+                                UnwindSections *Info);
+
+  std::mutex M;
+  std::map<uintptr_t, UnwindSections> UWSecs;
+
+  int (*AddFindDynamicUnwindSections)(void *) = nullptr;
+  int (*RemoveFindDynamicUnwindSections)(void *) = nullptr;
+  void *FindDynamicUnwindSections = nullptr;
+
+  static const char *AddFnName, *RemoveFnName;
+};
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h
new file mode 100644
index 0000000000000..eb883a79a93d8
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h
@@ -0,0 +1,70 @@
+//===- UnwindInfoRegistrationPlugin.h -- libunwind registration -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Register eh-frame and compact-unwind sections with libunwind
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_UNWINDINFOREGISTRATIONPLUGIN_H
+#define LLVM_EXECUTIONENGINE_ORC_UNWINDINFOREGISTRATIONPLUGIN_H
+
+#include "llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h"
+
+namespace llvm::orc {
+
+class UnwindInfoRegistrationPlugin : public LinkGraphLinkingLayer::Plugin {
+public:
+  static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
+  Create(IRLayer &IRL, JITDylib &PlatformJD, ExecutorAddr Instance,
+         ExecutorAddr FindHelper, ExecutorAddr Enable, ExecutorAddr Disable,
+         ExecutorAddr Register, ExecutorAddr Deregister);
+
+  static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
+  Create(IRLayer &IRL, JITDylib &PlatformJD);
+
+  ~UnwindInfoRegistrationPlugin();
+
+  void modifyPassConfig(MaterializationResponsibility &MR,
+                        jitlink::LinkGraph &G,
+                        jitlink::PassConfiguration &PassConfig) override;
+
+  Error notifyEmitted(MaterializationResponsibility &MR) override {
+    return Error::success();
+  }
+
+  Error notifyFailed(MaterializationResponsibility &MR) override {
+    return Error::success();
+  }
+
+  Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override {
+    return Error::success();
+  }
+
+  void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey,
+                                   ResourceKey SrcKey) override {}
+
+private:
+  UnwindInfoRegistrationPlugin(ExecutionSession &ES, ExecutorAddr Instance,
+                               ExecutorAddr Disable, ExecutorAddr Register,
+                               ExecutorAddr Deregister)
+      : ES(ES), Instance(Instance), Disable(Disable), Register(Register),
+        Deregister(Deregister) {
+    DSOBaseName = ES.intern("__jitlink$libunwind_dso_base");
+  }
+
+  static Expected<ThreadSafeModule> makeBouncerModule(ExecutionSession &ES);
+  Error addUnwindInfoRegistrationActions(jitlink::LinkGraph &G);
+
+  ExecutionSession &ES;
+  SymbolStringPtr DSOBaseName;
+  ExecutorAddr Instance, Disable, Register, Deregister;
+};
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_UNWINDINFOREGISTRATIONPLUGIN_H
diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt
index e5f5a99c39bc0..65dd0c7468ae1 100644
--- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt
@@ -3,6 +3,7 @@ tablegen(LLVM COFFOptions.inc -gen-opt-parser-defs)
 add_public_tablegen_target(JITLinkTableGen)
 
 add_llvm_component_library(LLVMJITLink
+  CompactUnwindSupport.cpp
   DWARFRecordSectionSplitter.cpp
   EHFrameSupport.cpp
   JITLink.cpp
diff --git a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp
new file mode 100644
index 0000000000000..51e3d26479ffd
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp
@@ -0,0 +1,103 @@
+//=------- CompactUnwindSupport.cpp - Compact Unwind format support -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Compact Unwind support.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CompactUnwindSupport.h"
+
+#include "llvm/ADT/Sequence.h"
+
+#define DEBUG_TYPE "jitlink"
+
+namespace llvm {
+namespace jitlink {
+
+Error splitCompactUnwindBlocks(LinkGraph &G, Section &CompactUnwindSection,
+                               size_t RecordSize) {
+
+  std::vector<Block *> OriginalBlocks(CompactUnwindSection.blocks().begin(),
+                                      CompactUnwindSection.blocks().end());
+  LLVM_DEBUG({
+    dbgs() << "In " << G.getName() << " splitting compact unwind section "
+           << CompactUnwindSection.getName() << " containing "
+           << OriginalBlocks.size() << " initial blocks...\n";
+  });
+
+  while (!OriginalBlocks.empty()) {
+    auto *B = OriginalBlocks.back();
+    OriginalBlocks.pop_back();
+
+    if (B->getSize() == 0) {
+      LLVM_DEBUG({
+        dbgs() << "  Skipping empty block at "
+               << formatv("{0:x16}", B->getAddress()) << "\n";
+      });
+      continue;
+    }
+
+    unsigned NumBlocks = B->getSize() / RecordSize;
+
+    LLVM_DEBUG({
+      dbgs() << "  Splitting block at " << formatv("{0:x16}", B->getAddress())
+             << " into " << NumBlocks << " compact unwind record(s)\n";
+    });
+
+    if (B->getSize() % RecordSize)
+      return make_error<JITLinkError>(
+          "Error splitting compact unwind record in " + G.getName() +
+          ": block at " + formatv("{0:x}", B->getAddress()) + " has size " +
+          formatv("{0:x}", B->getSize()) +
+          " (not a multiple of CU record size of " +
+          formatv("{0:x}", RecordSize) + ")");
+
+    auto Blocks =
+        G.splitBlock(*B, map_range(seq(1U, NumBlocks), [=](Edge::OffsetT Idx) {
+          return Idx * RecordSize;
+        }));
+
+    for (auto *CURec : Blocks) {
+      bool AddedKeepAlive = false;
+
+      for (auto &E : CURec->edges()) {
+        if (E.getOffset() == 0) {
+          LLVM_DEBUG({
+            dbgs() << "    Updating compact unwind record at "
+                   << CURec->getAddress() << " to point to "
+                   << (E.getTarget().hasName() ? *E.getTarget().getName()
+                                               : StringRef())
+                   << " (at " << E.getTarget().getAddress() << ")\n";
+          });
+
+          if (E.getTarget().isExternal())
+            return make_error<JITLinkError>(
+                "Error adding keep-alive edge for compact unwind record at " +
+                formatv("{0:x}", CURec->getAddress()) + ": target " +
+                *E.getTarget().getName() + " is an external symbol");
+          auto &TgtBlock = E.getTarget().getBlock();
+          auto &CURecSym =
+              G.addAnonymousSymbol(*CURec, 0, RecordSize, false, false);
+          TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0);
+          AddedKeepAlive = true;
+        }
+      }
+
+      if (!AddedKeepAlive)
+        return make_error<JITLinkError>(
+            "Error adding keep-alive edge for compact unwind record at " +
+            formatv("{0:x}", CURec->getAddress()) +
+            ": no outgoing target edge at offset 0");
+    }
+  }
+
+  return Error::success();
+}
+
+} // end namespace jitlink
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
new file mode 100644
index 0000000000000..dc3ed942aa8ac
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
@@ -0,0 +1,653 @@
+//===- CompactUnwindSupportImpl.h - Compact Unwind format impl --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Compact Unwind format support implementation details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_EXECUTIONENGINE_JITLINK_COMPACTUNWINDSUPPORTIMPL_H
+#define LIB_EXECUTIONENGINE_JITLINK_COMPACTUNWINDSUPPORTIMPL_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ExecutionEngine/JITLink/MachO.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
+
+#define DEBUG_TYPE "jitlink_cu"
+
+namespace llvm {
+namespace jitlink {
+
+/// Split blocks in an __LD,__compact_unwind section on record boundaries.
+/// When this function returns edges within each record are guaranteed to be
+/// sorted by offset.
+Error splitCompactUnwindBlocks(LinkGraph &G, Section &CompactUnwindSection,
+                               size_t RecordSize);
+
+/// CRTP base for compact unwind traits classes. Automatically provides derived
+/// constants.
+///
+/// FIXME: Passing PtrSize as a template parameter is a hack to work around a
+///        bug in older MSVC compilers (until at least MSVC 15) where constexpr
+///        fields in the CRTP impl class were not visible to the base class.
+///        Once we no longer need to support these compilers the PtrSize
+///        template argument should be removed and PointerSize should be
+///        defined as a member in the CRTP Impl classes.
+template <typename CRTPImpl, size_t PtrSize> struct CompactUnwindTraits {
+  static constexpr size_t PointerSize = PtrSize;
+  static constexpr size_t Size = 3 * PointerSize + 2 * 4;
+  static constexpr size_t FnFieldOffset = 0;
+  static constexpr size_t SizeFieldOffset = FnFieldOffset + PointerSize;
+  static constexpr size_t EncodingFieldOffset = SizeFieldOffset + 4;
+  static constexpr size_t PersonalityFieldOffset = EncodingFieldOffset + 4;
+  static constexpr size_t LSDAFieldOffset =
+      PersonalityFieldOffset + PointerSize;
+
+  static uint32_t readPCRangeSize(ArrayRef<char> RecordContent) {
+    assert(SizeFieldOffset + 4 <= RecordContent.size() &&
+           "Truncated CU record?");
+    return support::endian::read32<CRTPImpl::Endianness>(RecordContent.data() +
+                                                         SizeFieldOffset);
+  }
+
+  static uint32_t readEncoding(ArrayRef<char> RecordContent) {
+    assert(EncodingFieldOffset + 4 <= RecordContent.size() &&
+           "Truncated CU record?");
+    return support::endian::read32<CRTPImpl::Endianness>(RecordContent.data() +
+                                                         EncodingFieldOffset);
+  }
+};
+
+/// Architecture specific implementation of CompactUnwindManager.
+template <typename CURecTraits> class CompactUnwindManager {
+public:
+  CompactUnwindManager(StringRef CompactUnwindSectionName,
+                       StringRef UnwindInfoSectionName,
+                       StringRef EHFrameSectionName)
+      : CompactUnwindSectionName(CompactUnwindSectionName),
+        UnwindInfoSectionName(UnwindInfoSectionName),
+        EHFrameSectionName(EHFrameSectionName) {}
+
+  // Split compact unwind records, add keep-alive edges from functions to
+  // compact unwind records, and from compact unwind records to FDEs where
+  // needed.
+  //
+  // This method must be called *after* __eh_frame has been processed: it
+  // assumes that eh-frame records have been split up and keep-alive edges have
+  // been inserted.
+  Error prepareForPrune(LinkGraph &G) {
+    Section *CUSec = G.findSectionByName(CompactUnwindSectionName);
+    if (!CUSec || CUSec->empty()) {
+      LLVM_DEBUG({
+        dbgs() << "Compact unwind: No compact unwind info for " << G.getName()
+               << "\n";
+      });
+      return Error::success();
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "Compact unwind: preparing " << G.getName() << " for prune\n";
+    });
+
+    Section *EHFrameSec = G.findSectionByName(EHFrameSectionName);
+
+    if (auto Err = splitCompactUnwindBlocks(G, *CUSec, CURecTraits::Size))
+      return Err;
+
+    LLVM_DEBUG({
+      dbgs() << "  Preparing " << CUSec->blocks_size() << " blocks in "
+             << CompactUnwindSectionName << "\n";
+    });
+
+    for (auto *B : CUSec->blocks()) {
+
+      // Find target function edge.
+      Edge *PCBeginEdge = nullptr;
+      for (auto &E : B->edges_at(CURecTraits::FnFieldOffset)) {
+        PCBeginEdge = &E;
+        break;
+      }
+
+      if (!PCBeginEdge)
+        return make_error<JITLinkError>(
+            "In " + G.getName() + ", compact unwind record at " +
+            formatv("{0:x}", B->getAddress()) + " has no pc-begin edge");
+
+      if (!PCBeginEdge->getTarget().isDefined())
+        return make_error<JITLinkError>(
+            "In " + G.getName() + ", compact unwind record at " +
+            formatv("{0:x}", B->getAddress()) + " points at external symbol " +
+            *PCBeginEdge->getTarget().getName());
+
+      auto &Fn = PCBeginEdge->getTarget();
+
+      if (!Fn.isDefined()) {
+        LLVM_DEBUG({
+          dbgs() << "In " << CompactUnwindSectionName << " for " << G.getName()
+                 << " encountered unexpected pc-edge to undefined symbol "
+                 << Fn.getName() << "\n";
+        });
+        continue;
+      } else {
+        LLVM_DEBUG({
+          dbgs() << "    Found record for function ";
+          if (Fn.hasName())
+            dbgs() << Fn.getName();
+          else
+            dbgs() << "<anon @ " << Fn.getAddress() << '>';
+          dbgs() << '\n';
+        });
+      }
+
+      bool NeedsDWARF = CURecTraits::encodingSpecifiesDWARF(
+          CURecTraits::readEncoding(B->getContent()));
+
+      auto &CURecSym =
+          G.addAnonymousSymbol(*B, 0, CURecTraits::Size, false, false);
+
+      bool KeepAliveAlreadyPresent = false;
+      if (EHFrameSec) {
+        Edge *KeepAliveEdge = nullptr;
+        for (auto &E : Fn.getBlock().edges_at(0)) {
+          if (E.getKind() == Edge::KeepAlive && E.getTarget().isDefined() &&
+              &E.getTarget().getBlock().getSection() == EHFrameSec) {
+            KeepAliveEdge = &E;
+            break;
+          }
+        }
+
+        if (KeepAliveEdge) {
+          // Found a keep-alive edge to an FDE in the eh-frame. Switch the keep
+          // alive edge to point to the CU and if the CU needs DWARF then add
+          // an extra keep-alive edge from the CU to the FDE.
+          auto &FDE = KeepAliveEdge->getTarget();
+          KeepAliveEdge->setTarget(CURecSym);
+          KeepAliveAlreadyPresent = true;
+          if (NeedsDWARF) {
+            LLVM_DEBUG({
+              dbgs() << "      Needs DWARF: adding keep-alive edge to FDE at "
+                     << FDE.getAddress() << "\n";
+            });
+            B->addEdge(Edge::KeepAlive, 0, FDE, 0);
+          }
+        } else {
+          if (NeedsDWARF)
+            return make_error<JITLinkError>(
+                "In " + G.getName() + ", compact unwind recard ot " +
+                formatv("{0:x}", B->getAddress()) +
+                " needs DWARF, but no FDE was found");
+        }
+      } else {
+        if (NeedsDWARF)
+          return make_error<JITLinkError>(
+              "In " + G.getName() + ", compact unwind recard ot " +
+              formatv("{0:x}", B->getAddress()) + " needs DWARF, but no " +
+              EHFrameSectionName + " section exists");
+      }
+
+      if (!KeepAliveAlreadyPresent) {
+        // No FDE edge. We'll need to add a new edge from the function back
+        // to the CU record.
+        Fn.getBlock().addEdge(Edge::KeepAlive, 0, CURecSym, 0);
+      }
+    }
+
+    return Error::success();
+  }
+
+  /// Process all __compact_unwind records and reserve space for __unwind_info.
+  Error processAndReserveUnwindInfo(LinkGraph &G) {
+    // Bail out early if no unwind info.
+    Section *CUSec = G.findSectionByName(CompactUnwindSectionName);
+    if (!CUSec)
+      return Error::success();
+
+    // The __LD/__compact_unwind section is only used as input for the linker.
+    // We'll create a new __TEXT,__unwind_info section for unwind info output.
+    CUSec->setMemLifetime(orc::MemLifetime::NoAlloc);
+
+    // Find / make a mach-header to act as the base for unwind-info offsets
+    // (and to report the arch / subarch to libunwind).
+    if (auto Err = getOrCreateCompactUnwindBase(G))
+      return Err;
+
+    // Error out if there's already unwind-info in the graph: We have no idea
+    // how to merge unwind-info sections.
+    if (G.findSectionByName(UnwindInfoSectionName))
+      return make_error<JITLinkError>("In " + G.getName() + ", " +
+                                      UnwindInfoSectionName +
+                                      " already exists");
+
+    // Process the __compact_unwind section to build the Records vector that
+    // we'll use for writing the __unwind_info section.
+    if (auto Err = processCompactUnwind(G, *CUSec))
+      return Err;
+
+    // Calculate the size of __unwind_info.
+    size_t UnwindInfoSectionSize =
+        UnwindInfoSectionHeaderSize +
+        Personalities.size() * PersonalityEntrySize +
+        (NumSecondLevelPages + 1) * IndexEntrySize + NumLSDAs * LSDAEntrySize +
+        NumSecondLevelPages * SecondLevelPageHeaderSize +
+        Records.size() * SecondLevelPageEntrySize;
+
+    LLVM_DEBUG({
+      dbgs() << "In " << G.getName() << ", reserving "
+             << formatv("{0:x}", UnwindInfoSectionSize) << " bytes for "
+             << UnwindInfoSectionName << "\n";
+    });
+
+    // Create the __unwind_info section and reserve space for it.
+    Section &UnwindInfoSec =
+        G.createSection(UnwindInfoSectionName, orc::MemProt::Read);
+
+    auto UnwindInfoSectionContent = G.allocateBuffer(UnwindInfoSectionSize);
+    memset(UnwindInfoSectionContent.data(), 0, UnwindInfoSectionContent.size());
+    auto &B = G.createMutableContentBlock(
+        UnwindInfoSec, UnwindInfoSectionContent, orc::ExecutorAddr(), 8, 0);
+
+    // Add Keep-alive edges from the __unwind_info block to all of the target
+    // functions.
+    for (auto &R : Records)
+      B.addEdge(Edge::KeepAlive, 0, *R.Fn, 0);
+
+    return Error::success();
+  }
+
+  Error writeUnwindInfo(LinkGraph &G) {
+    Section *CUSec = G.findSectionByName(CompactUnwindSectionName);
+    if (!CUSec || CUSec->empty())
+      return Error::success();
+
+    Section *UnwindInfoSec = G.findSectionByName(UnwindInfoSectionName);
+    if (!UnwindInfoSec)
+      return make_error<JITLinkError>("In " + G.getName() + ", " +
+                                      UnwindInfoSectionName +
+                                      " missing after allocation");
+
+    if (UnwindInfoSec->blocks_size() != 1)
+      return make_error<JITLinkError>(
+          "In " + G.getName() + ", " + UnwindInfoSectionName +
+          " contains more than one block post-allocation");
+
+    LLVM_DEBUG(
+        { dbgs() << "Writing unwind info for " << G.getName() << "...\n"; });
+
+    mergeRecords();
+
+    auto &UnwindInfoBlock = **UnwindInfoSec->blocks().begin();
+    auto Content = UnwindInfoBlock.getMutableContent(G);
+    BinaryStreamWriter Writer(
+        {reinterpret_cast<uint8_t *>(Content.data()), Content.size()},
+        CURecTraits::Endianness);
+
+    // __unwind_info format, from mach-o/compact_unwind_encoding.h on Darwin:
+    //
+    // #define UNWIND_SECTION_VERSION 1
+    // struct unwind_info_section_header
+    // {
+    //     uint32_t    version;            // UNWIND_SECTION_VERSION
+    //     uint32_t    commonEncodingsArraySectionOffset;
+    //     uint32_t    commonEncodingsArrayCount;
+    //     uint32_t    personalityArraySectionOffset;
+    //     uint32_t    personalityArrayCount;
+    //     uint32_t    indexSectionOffset;
+    //     uint32_t    indexCount;
+    //     // compact_unwind_encoding_t[]
+    //     // uint32_t personalities[]
+    //     // unwind_info_section_header_index_entry[]
+    //     // unwind_info_section_header_lsda_index_entry[]
+    // };
+
+    if (auto Err = writeHeader(G, Writer))
+      return Err;
+
+    // Skip common encodings: JITLink doesn't use them.
+
+    if (auto Err = writePersonalities(G, Writer))
+      return Err;
+
+    // Calculate the offset to the LSDAs.
+    size_t SectionOffsetToLSDAs =
+        Writer.getOffset() + (NumSecondLevelPages + 1) * IndexEntrySize;
+
+    // Calculate offset to the 1st second-level page.
+    size_t SectionOffsetToSecondLevelPages =
+        SectionOffsetToLSDAs + NumLSDAs * LSDAEntrySize;
+
+    if (auto Err = writeIndexes(G, Writer, SectionOffsetToLSDAs,
+                                SectionOffsetToSecondLevelPages))
+      return Err;
+
+    if (auto Err = writeLSDAs(G, Writer))
+      return Err;
+
+    if (auto Err = writeSecondLevelPages(G, Writer))
+      return Err;
+
+    LLVM_DEBUG({
+      dbgs() << "    Wrote " << formatv("{0:x}", Writer.getOffset())
+             << " bytes of unwind info.\n";
+    });
+
+    return Error::success();
+  }
+
+private:
+  // Calculate the size of unwind-info.
+  static constexpr size_t MaxPersonalities = 4;
+  static constexpr size_t PersonalityShift = 28;
+
+  static constexpr size_t UnwindInfoSectionHeaderSize = 4 * 7;
+  static constexpr size_t PersonalityEntrySize = 4;
+  static constexpr size_t IndexEntrySize = 3 * 4;
+  static constexpr size_t LSDAEntrySize = 2 * 4;
+  static constexpr size_t SecondLevelPageSize = 4096;
+  static constexpr size_t SecondLevelPageHeaderSize = 8;
+  static constexpr size_t SecondLevelPageEntrySize = 8;
+  static constexpr size_t NumRecordsPerSecondLevelPage =
+      (SecondLevelPageSize - SecondLevelPageHeaderSize) /
+      SecondLevelPageEntrySize;
+
+  struct CompactUnwindRecord {
+    Symbol *Fn = nullptr;
+    uint32_t Size = 0;
+    uint32_t Encoding = 0;
+    Symbol *LSDA = nullptr;
+    Symbol *FDE = nullptr;
+  };
+
+  Error processCompactUnwind(LinkGraph &G, Section &CUSec) {
+    // TODO: Reset NumLSDAs, Personalities and CompactUnwindRecords if
+    // processing more than once.
+    assert(NumLSDAs == 0 && "NumLSDAs should be zero");
+    assert(Records.empty() && "CompactUnwindRecords vector should be empty.");
+    assert(Personalities.empty() && "Personalities vector should be empty.");
+
+    SmallVector<CompactUnwindRecord> NonUniquedRecords;
+    NonUniquedRecords.reserve(CUSec.blocks_size());
+
+    // Process __compact_unwind blocks.
+    for (auto *B : CUSec.blocks()) {
+      CompactUnwindRecord R;
+      R.Encoding = CURecTraits::readEncoding(B->getContent());
+      for (auto &E : B->edges()) {
+        switch (E.getOffset()) {
+        case CURecTraits::FnFieldOffset:
+          // This could be the function-pointer, or the FDE keep-alive. Check
+          // the type to decide.
+          if (E.getKind() == Edge::KeepAlive)
+            R.FDE = &E.getTarget();
+          else
+            R.Fn = &E.getTarget();
+          break;
+        case CURecTraits::PersonalityFieldOffset: {
+          // Add the Personality to the Personalities map and update the
+          // encoding.
+          size_t PersonalityIdx = 0;
+          for (; PersonalityIdx != Personalities.size(); ++PersonalityIdx)
+            if (Personalities[PersonalityIdx] == &E.getTarget())
+              break;
+          if (PersonalityIdx == MaxPersonalities)
+            return make_error<JITLinkError>(
+                "In " + G.getName() +
+                ", __compact_unwind contains too many personalities (max " +
+                formatv("{}", MaxPersonalities) + ")");
+          if (PersonalityIdx == Personalities.size())
+            Personalities.push_back(&E.getTarget());
+
+          R.Encoding |= (PersonalityIdx + 1) << PersonalityShift;
+          break;
+        }
+        case CURecTraits::LSDAFieldOffset:
+          ++NumLSDAs;
+          R.LSDA = &E.getTarget();
+          break;
+        default:
+          return make_error<JITLinkError>("In " + G.getName() +
+                                          ", compact unwind record at " +
+                                          formatv("{0:x}", B->getAddress()) +
+                                          " has unrecognized edge at offset " +
+                                          formatv("{0:x}", E.getOffset()));
+        }
+      }
+      Records.push_back(R);
+    }
+
+    // Sort the records into ascending order.
+    llvm::sort(Records, [](const CompactUnwindRecord &LHS,
+                           const CompactUnwindRecord &RHS) {
+      return LHS.Fn->getAddress() < RHS.Fn->getAddress();
+    });
+
+    // Calculate the number of second-level pages required.
+    NumSecondLevelPages = (Records.size() + NumRecordsPerSecondLevelPage - 1) /
+                          NumRecordsPerSecondLevelPage;
+
+    // Convert personality symbols to GOT entry pointers.
+    typename CURecTraits::GOTManager GOT(G);
+    for (auto &Personality : Personalities)
+      Personality = &GOT.getEntryForTarget(G, *Personality);
+
+    LLVM_DEBUG({
+      dbgs() << "  In " << G.getName() << ", " << CompactUnwindSectionName
+             << ": raw records = " << Records.size()
+             << ", personalities = " << Personalities.size()
+             << ", lsdas = " << NumLSDAs << "\n";
+    });
+
+    return Error::success();
+  }
+
+  void mergeRecords() {
+    SmallVector<CompactUnwindRecord> NonUniqued = std::move(Records);
+    Records.reserve(NonUniqued.size());
+
+    Records.push_back(NonUniqued.front());
+    for (size_t I = 1; I != NonUniqued.size(); ++I) {
+      auto &Next = NonUniqued[I];
+      auto &Last = Records.back();
+
+      bool NextNeedsDWARF = CURecTraits::encodingSpecifiesDWARF(Next.Encoding);
+      bool CannotBeMerged = CURecTraits::encodingCannotBeMerged(Next.Encoding);
+      if (NextNeedsDWARF || (Next.Encoding != Last.Encoding) ||
+          CannotBeMerged || Next.LSDA || Last.LSDA)
+        Records.push_back(Next);
+    }
+
+    // Recalculate derived values that may have changed.
+    NumSecondLevelPages = (Records.size() + NumRecordsPerSecondLevelPage - 1) /
+                          NumRecordsPerSecondLevelPage;
+  }
+
+  Error writeHeader(LinkGraph &G, BinaryStreamWriter &W) {
+    if (!isUInt<32>(NumSecondLevelPages + 1))
+      return make_error<JITLinkError>("In " + G.getName() + ", too many " +
+                                      UnwindInfoSectionName +
+                                      "second-level pages required");
+
+    // Write __unwind_info header.
+    size_t IndexArrayOffset = UnwindInfoSectionHeaderSize +
+                              Personalities.size() * PersonalityEntrySize;
+
+    cantFail(W.writeInteger<uint32_t>(1));
+    cantFail(W.writeInteger<uint32_t>(UnwindInfoSectionHeaderSize));
+    cantFail(W.writeInteger<uint32_t>(0));
+    cantFail(W.writeInteger<uint32_t>(UnwindInfoSectionHeaderSize));
+    cantFail(W.writeInteger<uint32_t>(Personalities.size()));
+    cantFail(W.writeInteger<uint32_t>(IndexArrayOffset));
+    cantFail(W.writeInteger<uint32_t>(NumSecondLevelPages + 1));
+
+    return Error::success();
+  }
+
+  Error writePersonalities(LinkGraph &G, BinaryStreamWriter &W) {
+    // Write personalities.
+    for (auto *PSym : Personalities) {
+      auto Delta = PSym->getAddress() - CompactUnwindBase->getAddress();
+      if (!isUInt<32>(Delta))
+        return makePersonalityRangeError(G, *PSym);
+      cantFail(W.writeInteger<uint32_t>(Delta));
+    }
+    return Error::success();
+  }
+
+  Error writeIndexes(LinkGraph &G, BinaryStreamWriter &W,
+                     size_t SectionOffsetToLSDAs,
+                     size_t SectionOffsetToSecondLevelPages) {
+    // Assume that function deltas are ok in this method -- we'll error
+    // check all of them when we write the second level pages.
+
+    // Write the header index entries.
+    size_t RecordIdx = 0;
+    size_t NumPreviousLSDAs = 0;
+    for (auto &R : Records) {
+      // If this record marks the start of a new second level page.
+      if (RecordIdx % NumRecordsPerSecondLevelPage == 0) {
+        auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress();
+        auto SecondLevelPageOffset = SectionOffsetToSecondLevelPages +
+                                     (RecordIdx / NumRecordsPerSecondLevelPage);
+        auto LSDAOffset =
+            SectionOffsetToLSDAs + NumPreviousLSDAs * LSDAEntrySize;
+
+        cantFail(W.writeInteger<uint32_t>(FnDelta));
+        cantFail(W.writeInteger<uint32_t>(SecondLevelPageOffset));
+        cantFail(W.writeInteger<uint32_t>(LSDAOffset));
+      }
+      if (R.LSDA)
+        ++NumPreviousLSDAs;
+      ++RecordIdx;
+    }
+
+    // Write the index array terminator.
+    {
+      auto FnEndDelta =
+          Records.back().Fn->getRange().End - CompactUnwindBase->getAddress();
+
+      if (LLVM_UNLIKELY(!isUInt<32>(FnEndDelta)))
+        return make_error<JITLinkError>(
+            "In " + G.getName() + " " + UnwindInfoSectionName +
+            ", delta to end of functions  " +
+            formatv("{0:x}", Records.back().Fn->getRange().End) +
+            " exceeds 32 bits");
+
+      cantFail(W.writeInteger<uint32_t>(FnEndDelta));
+      cantFail(W.writeInteger<uint32_t>(0));
+      cantFail(W.writeInteger<uint32_t>(SectionOffsetToSecondLevelPages));
+    }
+
+    return Error::success();
+  }
+
+  Error writeLSDAs(LinkGraph &G, BinaryStreamWriter &W) {
+    // As with writeIndexes, assume that function deltas are ok for now.
+    for (auto &R : Records) {
+      if (R.LSDA) {
+        auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress();
+        auto LSDADelta = R.LSDA->getAddress() - CompactUnwindBase->getAddress();
+
+        if (LLVM_UNLIKELY(!isUInt<32>(LSDADelta)))
+          return make_error<JITLinkError>(
+              "In " + G.getName() + " " + UnwindInfoSectionName +
+              ", delta to lsda at " + formatv("{0:x}", R.LSDA->getAddress()) +
+              " exceeds 32 bits");
+
+        cantFail(W.writeInteger<uint32_t>(FnDelta));
+        cantFail(W.writeInteger<uint32_t>(LSDADelta));
+      }
+    }
+
+    return Error::success();
+  }
+
+  Error writeSecondLevelPages(LinkGraph &G, BinaryStreamWriter &W) {
+    size_t RecordIdx = 0;
+
+    for (auto &R : Records) {
+      // When starting a new second-level page, write the page header:
+      //
+      //   2     : uint32_t    -- UNWIND_SECOND_LEVEL_REGULAR
+      //   8     : uint16_t    -- size of second level page table header
+      //   count : uint16_t    -- num entries in this second-level page
+      if (RecordIdx % NumRecordsPerSecondLevelPage == 0) {
+        constexpr uint32_t SecondLevelPageHeaderKind = 2;
+        constexpr uint16_t SecondLevelPageHeaderSize = 8;
+        uint16_t SecondLevelPageNumEntries =
+            std::min(Records.size() - RecordIdx, NumRecordsPerSecondLevelPage);
+
+        cantFail(W.writeInteger<uint32_t>(SecondLevelPageHeaderKind));
+        cantFail(W.writeInteger<uint16_t>(SecondLevelPageHeaderSize));
+        cantFail(W.writeInteger<uint16_t>(SecondLevelPageNumEntries));
+      }
+
+      // Write entry.
+      auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress();
+
+      if (LLVM_UNLIKELY(!isUInt<32>(FnDelta)))
+        return make_error<JITLinkError>(
+            "In " + G.getName() + " " + UnwindInfoSectionName +
+            ", delta to function at " + formatv("{0:x}", R.Fn->getAddress()) +
+            " exceeds 32 bits");
+
+      cantFail(W.writeInteger<uint32_t>(FnDelta));
+      cantFail(W.writeInteger<uint32_t>(R.Encoding));
+
+      ++RecordIdx;
+    }
+
+    return Error::success();
+  }
+
+  Error getOrCreateCompactUnwindBase(LinkGraph &G) {
+    auto Name = G.intern("__jitlink$libunwind_dso_base");
+    CompactUnwindBase = G.findAbsoluteSymbolByName(Name);
+    if (!CompactUnwindBase) {
+      if (auto LocalCUBase = getOrCreateLocalMachOHeader(G)) {
+        CompactUnwindBase = &*LocalCUBase;
+        auto &B = LocalCUBase->getBlock();
+        G.addDefinedSymbol(B, 0, *Name, B.getSize(), Linkage::Strong,
+                           Scope::Local, false, true);
+      } else
+        return LocalCUBase.takeError();
+    }
+    CompactUnwindBase->setLive(true);
+    return Error::success();
+  }
+
+  Error makePersonalityRangeError(LinkGraph &G, Symbol &PSym) {
+    std::string ErrMsg;
+    {
+      raw_string_ostream ErrStream(ErrMsg);
+      ErrStream << "In " << G.getName() << " " << UnwindInfoSectionName
+                << ", personality ";
+      if (PSym.hasName())
+        ErrStream << PSym.getName() << " ";
+      ErrStream << "at " << PSym.getAddress()
+                << " is out of 32-bit delta range of compact-unwind base at "
+                << CompactUnwindBase->getAddress();
+    }
+    return make_error<JITLinkError>(std::move(ErrMsg));
+  }
+
+  StringRef CompactUnwindSectionName;
+  StringRef UnwindInfoSectionName;
+  StringRef EHFrameSectionName;
+  Symbol *CompactUnwindBase = nullptr;
+
+  size_t NumLSDAs = 0;
+  size_t NumSecondLevelPages = 0;
+  SmallVector<Symbol *, MaxPersonalities> Personalities;
+  SmallVector<CompactUnwindRecord> Records;
+};
+
+} // end namespace jitlink
+} // end namespace llvm
+
+#undef DEBUG_TYPE
+
+#endif // LIB_EXECUTIONENGINE_JITLINK_COMPACTUNWINDSUPPORTIMPL_H
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
index 3e757f780b550..179e458c3cd1f 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
@@ -733,121 +733,5 @@ Error MachOLinkGraphBuilder::graphifyCStringSection(
   return Error::success();
 }
 
-Error CompactUnwindSplitter::operator()(LinkGraph &G) {
-  auto *CUSec = G.findSectionByName(CompactUnwindSectionName);
-  if (!CUSec)
-    return Error::success();
-
-  if (!G.getTargetTriple().isOSBinFormatMachO())
-    return make_error<JITLinkError>(
-        "Error linking " + G.getName() +
-        ": compact unwind splitting not supported on non-macho target " +
-        G.getTargetTriple().str());
-
-  unsigned CURecordSize = 0;
-  unsigned PersonalityEdgeOffset = 0;
-  unsigned LSDAEdgeOffset = 0;
-  switch (G.getTargetTriple().getArch()) {
-  case Triple::aarch64:
-  case Triple::x86_64:
-    // 64-bit compact-unwind record format:
-    // Range start: 8 bytes.
-    // Range size:  4 bytes.
-    // CU encoding: 4 bytes.
-    // Personality: 8 bytes.
-    // LSDA:        8 bytes.
-    CURecordSize = 32;
-    PersonalityEdgeOffset = 16;
-    LSDAEdgeOffset = 24;
-    break;
-  default:
-    return make_error<JITLinkError>(
-        "Error linking " + G.getName() +
-        ": compact unwind splitting not supported on " +
-        G.getTargetTriple().getArchName());
-  }
-
-  std::vector<Block *> OriginalBlocks(CUSec->blocks().begin(),
-                                      CUSec->blocks().end());
-  LLVM_DEBUG({
-    dbgs() << "In " << G.getName() << " splitting compact unwind section "
-           << CompactUnwindSectionName << " containing "
-           << OriginalBlocks.size() << " initial blocks...\n";
-  });
-
-  while (!OriginalBlocks.empty()) {
-    auto *B = OriginalBlocks.back();
-    OriginalBlocks.pop_back();
-
-    if (B->getSize() == 0) {
-      LLVM_DEBUG({
-        dbgs() << "  Skipping empty block at "
-               << formatv("{0:x16}", B->getAddress()) << "\n";
-      });
-      continue;
-    }
-
-    unsigned NumBlocks = B->getSize() / CURecordSize;
-
-    LLVM_DEBUG({
-      dbgs() << "  Splitting block at " << formatv("{0:x16}", B->getAddress())
-             << " into " << NumBlocks << " compact unwind record(s)\n";
-    });
-
-    if (B->getSize() % CURecordSize)
-      return make_error<JITLinkError>(
-          "Error splitting compact unwind record in " + G.getName() +
-          ": block at " + formatv("{0:x}", B->getAddress()) + " has size " +
-          formatv("{0:x}", B->getSize()) +
-          " (not a multiple of CU record size of " +
-          formatv("{0:x}", CURecordSize) + ")");
-
-    auto Blocks =
-        G.splitBlock(*B, map_range(seq(1U, NumBlocks), [=](Edge::OffsetT Idx) {
-          return Idx * CURecordSize;
-        }));
-
-    for (auto *CURec : Blocks) {
-      bool AddedKeepAlive = false;
-
-      for (auto &E : CURec->edges()) {
-        if (E.getOffset() == 0) {
-          LLVM_DEBUG({
-            dbgs() << "    Updating compact unwind record at "
-                   << CURec->getAddress() << " to point to "
-                   << (E.getTarget().hasName() ? *E.getTarget().getName()
-                                               : StringRef())
-                   << " (at " << E.getTarget().getAddress() << ")\n";
-          });
-
-          if (E.getTarget().isExternal())
-            return make_error<JITLinkError>(
-                "Error adding keep-alive edge for compact unwind record at " +
-                formatv("{0:x}", CURec->getAddress()) + ": target " +
-                *E.getTarget().getName() + " is an external symbol");
-          auto &TgtBlock = E.getTarget().getBlock();
-          auto &CURecSym =
-              G.addAnonymousSymbol(*CURec, 0, CURecordSize, false, false);
-          TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0);
-          AddedKeepAlive = true;
-        } else if (E.getOffset() != PersonalityEdgeOffset &&
-                   E.getOffset() != LSDAEdgeOffset)
-          return make_error<JITLinkError>(
-              "Unexpected edge at offset " + formatv("{0:x}", E.getOffset()) +
-              " in compact unwind record at " +
-              formatv("{0:x}", CURec->getAddress()));
-      }
-
-      if (!AddedKeepAlive)
-        return make_error<JITLinkError>(
-            "Error adding keep-alive edge for compact unwind record at " +
-            formatv("{0:x}", CURec->getAddress()) +
-            ": no outgoing target edge at offset 0");
-    }
-  }
-
-  return Error::success();
-}
-
 } // end namespace jitlink
 } // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
index 6afa01250f62d..343218ec9ad18 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
@@ -236,17 +236,6 @@ class MachOLinkGraphBuilder {
   StringMap<SectionParserFunction> CustomSectionParserFunctions;
 };
 
-/// A pass to split up __LD,__compact_unwind sections.
-class CompactUnwindSplitter {
-public:
-  CompactUnwindSplitter(StringRef CompactUnwindSectionName)
-      : CompactUnwindSectionName(CompactUnwindSectionName) {}
-  Error operator()(LinkGraph &G);
-
-private:
-  StringRef CompactUnwindSectionName;
-};
-
 } // end namespace jitlink
 } // end namespace llvm
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 29061fff9c2ae..4860db4f5eb37 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -13,7 +13,9 @@
 #include "llvm/ExecutionEngine/JITLink/MachO_arm64.h"
 #include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h"
 #include "llvm/ExecutionEngine/JITLink/aarch64.h"
+#include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h"
 
+#include "CompactUnwindSupport.h"
 #include "DefineExternalSectionStartAndEndSymbols.h"
 #include "MachOLinkGraphBuilder.h"
 
@@ -625,6 +627,27 @@ static Error applyPACSigningToModInitPointers(LinkGraph &G) {
   return Error::success();
 }
 
+struct CompactUnwindTraits_MachO_arm64
+    : public CompactUnwindTraits<CompactUnwindTraits_MachO_arm64,
+                                 /* PointerSize = */ 8> {
+  // FIXME: Reinstate once we no longer need the MSVC workaround. See
+  //        FIXME for CompactUnwindTraits in CompactUnwindSupport.h.
+  // constexpr static size_t PointerSize = 8;
+
+  constexpr static endianness Endianness = endianness::little;
+
+  constexpr static uint32_t EncodingModeMask = 0x0f000000;
+
+  using GOTManager = aarch64::GOTTableManager;
+
+  static bool encodingSpecifiesDWARF(uint32_t Encoding) {
+    constexpr uint32_t DWARFMode = 0x03000000;
+    return (Encoding & EncodingModeMask) == DWARFMode;
+  }
+
+  static bool encodingCannotBeMerged(uint32_t Encoding) { return false; }
+};
+
 void link_MachO_arm64(std::unique_ptr<LinkGraph> G,
                       std::unique_ptr<JITLinkContext> Ctx) {
 
@@ -637,16 +660,21 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G,
     else
       Config.PrePrunePasses.push_back(markAllSymbolsLive);
 
-    // Add compact unwind splitter pass.
-    Config.PrePrunePasses.push_back(
-        CompactUnwindSplitter("__LD,__compact_unwind"));
-
     // Add eh-frame passes.
-    // FIXME: Prune eh-frames for which compact-unwind is available once
-    // we support compact-unwind registration with libunwind.
     Config.PrePrunePasses.push_back(createEHFrameSplitterPass_MachO_arm64());
     Config.PrePrunePasses.push_back(createEHFrameEdgeFixerPass_MachO_arm64());
 
+    // Create a compact-unwind manager for use in passes below.
+    auto CompactUnwindMgr =
+        std::make_shared<CompactUnwindManager<CompactUnwindTraits_MachO_arm64>>(
+            orc::MachOCompactUnwindSectionName, orc::MachOUnwindInfoSectionName,
+            orc::MachOEHFrameSectionName);
+
+    // Add compact unwind prepare pass.
+    Config.PrePrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) {
+      return CompactUnwindMgr->prepareForPrune(G);
+    });
+
     // Resolve any external section start / end symbols.
     Config.PostAllocationPasses.push_back(
         createDefineExternalSectionStartAndEndSymbolsPass(
@@ -663,6 +691,16 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G,
       Config.PreFixupPasses.push_back(
           aarch64::lowerPointer64AuthEdgesToSigningFunction);
     }
+
+    // Reserve unwind-info space.
+    Config.PostPrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) {
+      return CompactUnwindMgr->processAndReserveUnwindInfo(G);
+    });
+
+    // Translate compact-unwind to unwind-info.
+    Config.PreFixupPasses.push_back([CompactUnwindMgr](LinkGraph &G) {
+      return CompactUnwindMgr->writeUnwindInfo(G);
+    });
   }
 
   if (auto Err = Ctx->modifyPassConfig(*G, Config))
@@ -673,11 +711,11 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G,
 }
 
 LinkGraphPassFunction createEHFrameSplitterPass_MachO_arm64() {
-  return DWARFRecordSectionSplitter("__TEXT,__eh_frame");
+  return DWARFRecordSectionSplitter(orc::MachOEHFrameSectionName);
 }
 
 LinkGraphPassFunction createEHFrameEdgeFixerPass_MachO_arm64() {
-  return EHFrameEdgeFixer("__TEXT,__eh_frame", aarch64::PointerSize,
+  return EHFrameEdgeFixer(orc::MachOEHFrameSectionName, aarch64::PointerSize,
                           aarch64::Pointer32, aarch64::Pointer64,
                           aarch64::Delta32, aarch64::Delta64,
                           aarch64::NegDelta32);
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index 9547266dc9789..d56dfdc07636d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -13,7 +13,9 @@
 #include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h"
 #include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
+#include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h"
 
+#include "CompactUnwindSupport.h"
 #include "DefineExternalSectionStartAndEndSymbols.h"
 #include "MachOLinkGraphBuilder.h"
 
@@ -500,26 +502,57 @@ Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromMachOObject_x86_64(
       .buildGraph();
 }
 
+struct CompactUnwindTraits_MachO_x86_64
+    : public CompactUnwindTraits<CompactUnwindTraits_MachO_x86_64,
+                                 /* PointerSize = */ 8> {
+  // FIXME: Reinstate once we no longer need the MSVC workaround. See
+  //        FIXME for CompactUnwindTraits in CompactUnwindSupport.h.
+  // constexpr static size_t PointerSize = 8;
+
+  constexpr static endianness Endianness = endianness::little;
+
+  constexpr static uint32_t EncodingModeMask = 0x0f000000;
+
+  using GOTManager = x86_64::GOTTableManager;
+
+  static bool encodingSpecifiesDWARF(uint32_t Encoding) {
+    constexpr uint32_t DWARFMode = 0x04000000;
+    return (Encoding & EncodingModeMask) == DWARFMode;
+  }
+
+  static bool encodingCannotBeMerged(uint32_t Encoding) {
+    constexpr uint32_t StackIndirectMode = 0x03000000;
+    return (Encoding & EncodingModeMask) == StackIndirectMode;
+  }
+};
+
 void link_MachO_x86_64(std::unique_ptr<LinkGraph> G,
                        std::unique_ptr<JITLinkContext> Ctx) {
 
   PassConfiguration Config;
 
   if (Ctx->shouldAddDefaultTargetPasses(G->getTargetTriple())) {
-    // Add eh-frame passes.
-    Config.PrePrunePasses.push_back(createEHFrameSplitterPass_MachO_x86_64());
-    Config.PrePrunePasses.push_back(createEHFrameEdgeFixerPass_MachO_x86_64());
-
-    // Add compact unwind splitter pass.
-    Config.PrePrunePasses.push_back(
-        CompactUnwindSplitter("__LD,__compact_unwind"));
-
     // Add a mark-live pass.
     if (auto MarkLive = Ctx->getMarkLivePass(G->getTargetTriple()))
       Config.PrePrunePasses.push_back(std::move(MarkLive));
     else
       Config.PrePrunePasses.push_back(markAllSymbolsLive);
 
+    // Add eh-frame passes.
+    Config.PrePrunePasses.push_back(createEHFrameSplitterPass_MachO_x86_64());
+    Config.PrePrunePasses.push_back(createEHFrameEdgeFixerPass_MachO_x86_64());
+
+    // Create a compact-unwind manager for use in passes below.
+    auto CompactUnwindMgr = std::make_shared<
+        CompactUnwindManager<CompactUnwindTraits_MachO_x86_64>>(
+        orc::MachOCompactUnwindSectionName, orc::MachOUnwindInfoSectionName,
+        orc::MachOEHFrameSectionName);
+
+    // Add compact unwind prepare pass.
+    Config.PrePrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) {
+      return CompactUnwindMgr->prepareForPrune(G);
+    });
+
     // Resolve any external section start / end symbols.
     Config.PostAllocationPasses.push_back(
         createDefineExternalSectionStartAndEndSymbolsPass(
@@ -528,6 +561,16 @@ void link_MachO_x86_64(std::unique_ptr<LinkGraph> G,
     // Add an in-place GOT/Stubs pass.
     Config.PostPrunePasses.push_back(buildGOTAndStubs_MachO_x86_64);
 
+    // Reserve space for unwind-info.
+    Config.PostPrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) {
+      return CompactUnwindMgr->processAndReserveUnwindInfo(G);
+    });
+
+    // Translate compact-unwind to unwind-info.
+    Config.PreFixupPasses.push_back([CompactUnwindMgr](LinkGraph &G) {
+      return CompactUnwindMgr->writeUnwindInfo(G);
+    });
+
     // Add GOT/Stubs optimizer pass.
     Config.PreFixupPasses.push_back(x86_64::optimizeGOTAndStubAccesses);
   }
@@ -540,11 +583,11 @@ void link_MachO_x86_64(std::unique_ptr<LinkGraph> G,
 }
 
 LinkGraphPassFunction createEHFrameSplitterPass_MachO_x86_64() {
-  return DWARFRecordSectionSplitter("__TEXT,__eh_frame");
+  return DWARFRecordSectionSplitter(orc::MachOEHFrameSectionName);
 }
 
 LinkGraphPassFunction createEHFrameEdgeFixerPass_MachO_x86_64() {
-  return EHFrameEdgeFixer("__TEXT,__eh_frame", x86_64::PointerSize,
+  return EHFrameEdgeFixer(orc::MachOEHFrameSectionName, x86_64::PointerSize,
                           x86_64::Pointer32, x86_64::Pointer64, x86_64::Delta32,
                           x86_64::Delta64, x86_64::NegDelta32);
 }
diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index 2ab5d6dd39b63..8a866294eee25 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -57,6 +57,7 @@ add_llvm_component_library(LLVMOrcJIT
   ExecutorProcessControl.cpp
   TaskDispatch.cpp
   ThreadSafeModule.cpp
+  UnwindInfoRegistrationPlugin.cpp
   RedirectionManager.cpp
   JITLinkRedirectableSymbolManager.cpp
   ReOptimizeLayer.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
index 5d2f3cd4a8be8..c4d65af1b57f8 100644
--- a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
@@ -33,6 +33,9 @@ irManglingOptionsFromTargetOptions(const TargetOptions &Opts) {
 
 /// Compile a Module to an ObjectFile.
 Expected<SimpleCompiler::CompileResult> SimpleCompiler::operator()(Module &M) {
+  if (M.getDataLayout().isDefault())
+    M.setDataLayout(TM.createDataLayout());
+
   CompileResult CachedObject = tryToLoadFromObjectCache(M);
   if (CachedObject)
     return std::move(CachedObject);
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index d47eb4416d3c2..9f466e725668a 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -1251,9 +1251,7 @@ JITDylib::JITDylib(ExecutionSession &ES, std::string Name)
   LinkOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols});
 }
 
-std::pair<JITDylib::AsynchronousSymbolQuerySet,
-          std::shared_ptr<SymbolDependenceMap>>
-JITDylib::IL_removeTracker(ResourceTracker &RT) {
+JITDylib::RemoveTrackerResult JITDylib::IL_removeTracker(ResourceTracker &RT) {
   // Note: Should be called under the session lock.
   assert(State != Closed && "JD is defunct");
 
@@ -1292,7 +1290,10 @@ JITDylib::IL_removeTracker(ResourceTracker &RT) {
       SymbolsToFail.push_back(Sym);
   }
 
-  auto Result = ES.IL_failSymbols(*this, std::move(SymbolsToFail));
+  auto [QueriesToFail, FailedSymbols] =
+      ES.IL_failSymbols(*this, std::move(SymbolsToFail));
+
+  std::vector<std::unique_ptr<MaterializationUnit>> DefunctMUs;
 
   // Removed symbols should be taken out of the table altogether.
   for (auto &Sym : SymbolsToRemove) {
@@ -1302,7 +1303,12 @@ JITDylib::IL_removeTracker(ResourceTracker &RT) {
     // Remove Materializer if present.
     if (I->second.hasMaterializerAttached()) {
       // FIXME: Should this discard the symbols?
-      UnmaterializedInfos.erase(Sym);
+      auto J = UnmaterializedInfos.find(Sym);
+      assert(J != UnmaterializedInfos.end() &&
+             "Symbol table indicates MU present, but no UMI record");
+      if (J->second->MU)
+        DefunctMUs.push_back(std::move(J->second->MU));
+      UnmaterializedInfos.erase(J);
     } else {
       assert(!UnmaterializedInfos.count(Sym) &&
              "Symbol has materializer attached");
@@ -1313,7 +1319,8 @@ JITDylib::IL_removeTracker(ResourceTracker &RT) {
 
   shrinkMaterializationInfoMemory();
 
-  return Result;
+  return {std::move(QueriesToFail), std::move(FailedSymbols),
+          std::move(DefunctMUs)};
 }
 
 void JITDylib::transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT) {
@@ -2180,16 +2187,17 @@ Error ExecutionSession::removeResourceTracker(ResourceTracker &RT) {
   });
   std::vector<ResourceManager *> CurrentResourceManagers;
 
-  JITDylib::AsynchronousSymbolQuerySet QueriesToFail;
-  std::shared_ptr<SymbolDependenceMap> FailedSymbols;
+  JITDylib::RemoveTrackerResult R;
 
   runSessionLocked([&] {
     CurrentResourceManagers = ResourceManagers;
     RT.makeDefunct();
-    std::tie(QueriesToFail, FailedSymbols) =
-        RT.getJITDylib().IL_removeTracker(RT);
+    R = RT.getJITDylib().IL_removeTracker(RT);
   });
 
+  // Release any defunct MaterializationUnits.
+  R.DefunctMUs.clear();
+
   Error Err = Error::success();
 
   auto &JD = RT.getJITDylib();
@@ -2197,9 +2205,9 @@ Error ExecutionSession::removeResourceTracker(ResourceTracker &RT) {
     Err = joinErrors(std::move(Err),
                      L->handleRemoveResources(JD, RT.getKeyUnsafe()));
 
-  for (auto &Q : QueriesToFail)
-    Q->handleFailed(
-        make_error<FailedToMaterialize>(getSymbolStringPool(), FailedSymbols));
+  for (auto &Q : R.QueriesToFail)
+    Q->handleFailed(make_error<FailedToMaterialize>(getSymbolStringPool(),
+                                                    R.FailedSymbols));
 
   return Err;
 }
diff --git a/llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp
index 217c693dae9c9..161bd68ff0854 100644
--- a/llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp
@@ -9,6 +9,7 @@
 #include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h"
 
 #include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
+#include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h"
 
 #define DEBUG_TYPE "orc"
 
@@ -21,11 +22,19 @@ EHFrameRegistrationPlugin::EHFrameRegistrationPlugin(
     : ES(ES), Registrar(std::move(Registrar)) {}
 
 void EHFrameRegistrationPlugin::modifyPassConfig(
-    MaterializationResponsibility &MR, LinkGraph &G,
+    MaterializationResponsibility &MR, LinkGraph &LG,
     PassConfiguration &PassConfig) {
 
+  if (LG.getTargetTriple().isOSBinFormatMachO())
+    PassConfig.PrePrunePasses.insert(
+        PassConfig.PrePrunePasses.begin(), [](LinkGraph &G) {
+          if (auto *CUSec = G.findSectionByName(MachOCompactUnwindSectionName))
+            G.removeSection(*CUSec);
+          return Error::success();
+        });
+
   PassConfig.PostFixupPasses.push_back(createEHFrameRecorderPass(
-      G.getTargetTriple(), [this, &MR](ExecutorAddr Addr, size_t Size) {
+      LG.getTargetTriple(), [this, &MR](ExecutorAddr Addr, size_t Size) {
         if (Addr) {
           std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
           assert(!InProcessLinks.count(&MR) &&
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index aa799687e6d5d..b51fa24be76d1 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -45,6 +45,7 @@ SelfExecutorProcessControl::SelfExecutorProcessControl(
   this->DylibMgr = this;
   this->JDI = {ExecutorAddr::fromPtr(jitDispatchViaWrapperFunctionManager),
                ExecutorAddr::fromPtr(this)};
+
   if (this->TargetTriple.isOSBinFormatMachO())
     GlobalManglingPrefix = '_';
 
@@ -52,6 +53,12 @@ SelfExecutorProcessControl::SelfExecutorProcessControl(
       ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper);
   this->BootstrapSymbols[rt::DeregisterEHFrameSectionWrapperName] =
       ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper);
+
+#ifdef __APPLE__
+  this->UnwindInfoMgr = UnwindInfoManager::TryCreate();
+  if (this->UnwindInfoMgr)
+    this->UnwindInfoMgr->addBootstrapSymbols(this->BootstrapSymbols);
+#endif // __APPLE__
 }
 
 Expected<std::unique_ptr<SelfExecutorProcessControl>>
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 80500d0fdd9bc..938fe58ef85cf 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
+#include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
@@ -1220,12 +1221,30 @@ Expected<JITDylibSP> setUpGenericLLVMIRPlatform(LLJIT &J) {
 
   if (auto *OLL = dyn_cast<ObjectLinkingLayer>(&J.getObjLinkingLayer())) {
 
-    auto &ES = J.getExecutionSession();
-    if (auto EHFrameRegistrar = EPCEHFrameRegistrar::Create(ES))
-      OLL->addPlugin(std::make_unique<EHFrameRegistrationPlugin>(
-          ES, std::move(*EHFrameRegistrar)));
-    else
-      return EHFrameRegistrar.takeError();
+    bool CompactUnwindInfoSupported = false;
+
+    // Enable compact-unwind support if possible.
+    if (J.getTargetTriple().isOSDarwin() ||
+        J.getTargetTriple().isOSBinFormatMachO()) {
+      if (auto UIRP = UnwindInfoRegistrationPlugin::Create(
+              J.getIRCompileLayer(), PlatformJD)) {
+        CompactUnwindInfoSupported = true;
+        OLL->addPlugin(std::move(*UIRP));
+        LLVM_DEBUG(dbgs() << "Enabled compact-unwind support.\n");
+      } else
+        consumeError(UIRP.takeError());
+    }
+
+    // Otherwise fall back to standard unwind registration.
+    if (!CompactUnwindInfoSupported) {
+      auto &ES = J.getExecutionSession();
+      if (auto EHFrameRegistrar = EPCEHFrameRegistrar::Create(ES)) {
+        OLL->addPlugin(std::make_unique<EHFrameRegistrationPlugin>(
+            ES, std::move(*EHFrameRegistrar)));
+        LLVM_DEBUG(dbgs() << "Enabled eh-frame support.\n");
+      } else
+        return EHFrameRegistrar.takeError();
+    }
   }
 
   J.setPlatformSupport(
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
index d94acf2768817..53488b75a2686 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp
@@ -18,6 +18,7 @@ namespace orc {
 StringRef MachODataCommonSectionName = "__DATA,__common";
 StringRef MachODataDataSectionName = "__DATA,__data";
 StringRef MachOEHFrameSectionName = "__TEXT,__eh_frame";
+StringRef MachOCompactUnwindSectionName = "__LD,__compact_unwind";
 StringRef MachOCStringSectionName = "__TEXT,__cstring";
 StringRef MachOModInitFuncSectionName = "__DATA,__mod_init_func";
 StringRef MachOObjCCatListSectionName = "__DATA,__objc_catlist";
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index 54a25c007c589..fef3ff989a52a 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -64,5 +64,19 @@ const char *RunAsIntFunctionWrapperName =
     "__llvm_orc_bootstrap_run_as_int_function_wrapper";
 
 } // end namespace rt
+namespace rt_alt {
+const char *UnwindInfoManagerInstanceName =
+    "orc_rt_alt_UnwindInfoManager_Instance";
+const char *UnwindInfoManagerFindSectionsHelperName =
+    "orc_rt_alt_UnwindInfoManager_findSectionsHelper";
+const char *UnwindInfoManagerEnableWrapperName =
+    "orc_rt_alt_UnwindInfoManager_enable";
+const char *UnwindInfoManagerDisableWrapperName =
+    "orc_rt_alt_UnwindInfoManager_disable";
+const char *UnwindInfoManagerRegisterActionName =
+    "orc_rt_alt_UnwindInfoManager_register";
+const char *UnwindInfoManagerDeregisterActionName =
+    "orc_rt_alt_UnwindInfoManager_deregister";
+} // end namespace rt_alt
 } // end namespace orc
 } // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index 3d1dfe758c79d..ffc1bbfa121b3 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMOrcTargetProcess
   SimpleExecutorMemoryManager.cpp
   SimpleRemoteEPCServer.cpp
   TargetExecutionUtils.cpp
+  UnwindInfoManager.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/ExecutionEngine/Orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
new file mode 100644
index 0000000000000..9f748154c03e5
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
@@ -0,0 +1,188 @@
+//===------- UnwindInfoManager.cpp - Register unwind info sections --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm;
+using namespace llvm::orc;
+using namespace llvm::orc::shared;
+
+static orc::shared::CWrapperFunctionResult
+llvm_orc_rt_alt_UnwindInfoManager_enable(const char *Data, uint64_t Size) {
+  return WrapperFunction<SPSError(SPSExecutorAddr, SPSExecutorAddr)>::handle(
+             Data, Size,
+             [](ExecutorAddr Instance, ExecutorAddr FindFn) {
+               return Instance.toPtr<UnwindInfoManager *>()->enable(
+                   FindFn.toPtr<void *>());
+             })
+      .release();
+}
+
+static orc::shared::CWrapperFunctionResult
+llvm_orc_rt_alt_UnwindInfoManager_disable(const char *Data, uint64_t Size) {
+  return WrapperFunction<SPSError(SPSExecutorAddr)>::handle(
+             Data, Size,
+             [](ExecutorAddr Instance) {
+               return Instance.toPtr<UnwindInfoManager *>()->disable();
+             })
+      .release();
+}
+
+static orc::shared::CWrapperFunctionResult
+llvm_orc_rt_alt_UnwindInfoManager_register(const char *Data, uint64_t Size) {
+  using SPSSig =
+      SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>,
+               SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange);
+
+  return WrapperFunction<SPSSig>::handle(
+             Data, Size,
+             [](ExecutorAddr Instance,
+                std::vector<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase,
+                ExecutorAddrRange DWARFRange,
+                ExecutorAddrRange CompactUnwindRange) {
+               return Instance.toPtr<UnwindInfoManager *>()->registerSections(
+                   CodeRanges, DSOBase, DWARFRange, CompactUnwindRange);
+             })
+      .release();
+}
+
+static orc::shared::CWrapperFunctionResult
+llvm_orc_rt_alt_UnwindInfoManager_deregister(const char *Data, uint64_t Size) {
+  using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>);
+
+  return WrapperFunction<SPSSig>::handle(
+             Data, Size,
+             [](ExecutorAddr Instance,
+                std::vector<ExecutorAddrRange> CodeRanges) {
+               return Instance.toPtr<UnwindInfoManager *>()->deregisterSections(
+                   CodeRanges);
+             })
+      .release();
+}
+
+namespace llvm::orc {
+
+const char *UnwindInfoManager::AddFnName =
+    "__unw_add_find_dynamic_unwind_sections";
+const char *UnwindInfoManager::RemoveFnName =
+    "__unw_remove_find_dynamic_unwind_sections";
+
+std::unique_ptr<UnwindInfoManager> UnwindInfoManager::TryCreate() {
+  std::string ErrMsg;
+  auto DL = sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg);
+  if (!DL.isValid())
+    return nullptr;
+
+  auto AddFindDynamicUnwindSections =
+      (int (*)(void *))DL.getAddressOfSymbol(AddFnName);
+  if (!AddFindDynamicUnwindSections)
+    return nullptr;
+
+  auto RemoveFindDynamicUnwindSections =
+      (int (*)(void *))DL.getAddressOfSymbol(RemoveFnName);
+  if (!RemoveFindDynamicUnwindSections)
+    return nullptr;
+
+  return std::unique_ptr<UnwindInfoManager>(new UnwindInfoManager(
+      AddFindDynamicUnwindSections, RemoveFindDynamicUnwindSections));
+}
+
+Error UnwindInfoManager::shutdown() { return Error::success(); }
+
+void UnwindInfoManager::addBootstrapSymbols(StringMap<ExecutorAddr> &M) {
+  M[rt_alt::UnwindInfoManagerInstanceName] = ExecutorAddr::fromPtr(this);
+  M[rt_alt::UnwindInfoManagerFindSectionsHelperName] =
+      ExecutorAddr::fromPtr(&findSectionsHelper);
+  M[rt_alt::UnwindInfoManagerEnableWrapperName] =
+      ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_enable);
+  M[rt_alt::UnwindInfoManagerDisableWrapperName] =
+      ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_disable);
+  M[rt_alt::UnwindInfoManagerRegisterActionName] =
+      ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_register);
+  M[rt_alt::UnwindInfoManagerDeregisterActionName] =
+      ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_deregister);
+}
+
+Error UnwindInfoManager::enable(void *FindDynamicUnwindSections) {
+  LLVM_DEBUG(dbgs() << "Enabling UnwindInfoManager.\n");
+
+  if (auto Err = AddFindDynamicUnwindSections(FindDynamicUnwindSections))
+    return make_error<StringError>(Twine("Could not register function via ") +
+                                       AddFnName +
+                                       ", error code = " + Twine(Err),
+                                   inconvertibleErrorCode());
+
+  this->FindDynamicUnwindSections = FindDynamicUnwindSections;
+  return Error::success();
+}
+
+Error UnwindInfoManager::disable(void) {
+  LLVM_DEBUG(dbgs() << "Disabling UnwindInfoManager.\n");
+
+  if (FindDynamicUnwindSections)
+    if (auto Err = RemoveFindDynamicUnwindSections(FindDynamicUnwindSections))
+      return make_error<StringError>(
+          Twine("Could not deregister function via ") + RemoveFnName +
+              "error code = " + Twine(Err),
+          inconvertibleErrorCode());
+
+  FindDynamicUnwindSections = nullptr;
+  return Error::success();
+}
+
+Error UnwindInfoManager::registerSections(
+    ArrayRef<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase,
+    ExecutorAddrRange DWARFEHFrame, ExecutorAddrRange CompactUnwind) {
+  std::lock_guard<std::mutex> Lock(M);
+  for (auto &R : CodeRanges)
+    UWSecs[R.Start.getValue()] =
+        UnwindSections{static_cast<uintptr_t>(DSOBase.getValue()),
+                       static_cast<uintptr_t>(DWARFEHFrame.Start.getValue()),
+                       static_cast<size_t>(DWARFEHFrame.size()),
+                       static_cast<uintptr_t>(CompactUnwind.Start.getValue()),
+                       static_cast<size_t>(CompactUnwind.size())};
+  return Error::success();
+}
+
+Error UnwindInfoManager::deregisterSections(
+    ArrayRef<ExecutorAddrRange> CodeRanges) {
+  std::lock_guard<std::mutex> Lock(M);
+  for (auto &R : CodeRanges) {
+    auto I = UWSecs.find(R.Start.getValue());
+    if (I == UWSecs.end())
+      return make_error<StringError>(
+          "No unwind-info sections registered for range " +
+              formatv("{0:x} - {1:x}", R.Start, R.End),
+          inconvertibleErrorCode());
+    UWSecs.erase(I);
+  }
+  return Error::success();
+}
+
+int UnwindInfoManager::findSections(uintptr_t Addr, UnwindSections *Info) {
+  std::lock_guard<std::mutex> Lock(M);
+  auto I = UWSecs.upper_bound(Addr);
+  if (I == UWSecs.begin())
+    return 0;
+  --I;
+  *Info = I->second;
+  return 1;
+}
+
+int UnwindInfoManager::findSectionsHelper(UnwindInfoManager *Instance,
+                                          uintptr_t Addr,
+                                          UnwindSections *Info) {
+  return Instance->findSections(Addr, Info);
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
new file mode 100644
index 0000000000000..ae1f3f98269db
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
@@ -0,0 +1,238 @@
+//===----- UnwindInfoRegistrationPlugin.cpp - libunwind registration ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h"
+
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h"
+#include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm::jitlink;
+
+static const char *FindDynamicUnwindSectionsFunctionName =
+    "_orc_rt_alt_find_dynamic_unwind_sections";
+
+namespace llvm::orc {
+
+Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
+UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD,
+                                     ExecutorAddr Instance,
+                                     ExecutorAddr FindHelper,
+                                     ExecutorAddr Enable, ExecutorAddr Disable,
+                                     ExecutorAddr Register,
+                                     ExecutorAddr Deregister) {
+
+  auto &ES = IRL.getExecutionSession();
+
+  // Build bouncer module.
+  auto M = makeBouncerModule(ES);
+  if (!M)
+    return M.takeError();
+
+  auto BouncerRT = PlatformJD.createResourceTracker();
+  auto RemoveBouncerModule = make_scope_exit([&]() {
+    if (auto Err = BouncerRT->remove())
+      ES.reportError(std::move(Err));
+  });
+
+  if (auto Err = PlatformJD.define(absoluteSymbols(
+          {{ES.intern(rt_alt::UnwindInfoManagerInstanceName),
+            ExecutorSymbolDef(Instance, JITSymbolFlags())},
+           {ES.intern(rt_alt::UnwindInfoManagerFindSectionsHelperName),
+            ExecutorSymbolDef(FindHelper, JITSymbolFlags::Callable)}})))
+    return std::move(Err);
+
+  if (auto Err = IRL.add(BouncerRT, std::move(*M)))
+    return Err;
+
+  auto FindUnwindSections =
+      ES.lookup({&PlatformJD}, FindDynamicUnwindSectionsFunctionName);
+  if (!FindUnwindSections)
+    return FindUnwindSections.takeError();
+
+  using namespace shared;
+  using SPSEnableSig = SPSError(SPSExecutorAddr, SPSExecutorAddr);
+  Error CallErr = Error::success();
+  if (auto Err = ES.callSPSWrapper<SPSEnableSig>(
+          Enable, CallErr, Instance, FindUnwindSections->getAddress())) {
+    consumeError(std::move(CallErr));
+    return std::move(Err);
+  }
+
+  if (CallErr)
+    return std::move(CallErr);
+
+  RemoveBouncerModule.release();
+
+  return std::shared_ptr<UnwindInfoRegistrationPlugin>(
+      new UnwindInfoRegistrationPlugin(ES, Instance, Disable, Register,
+                                       Deregister));
+}
+
+Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
+UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD) {
+
+  ExecutorAddr Instance, FindHelper, Enable, Disable, Register, Deregister;
+
+  auto &EPC = IRL.getExecutionSession().getExecutorProcessControl();
+  if (auto Err = EPC.getBootstrapSymbols(
+          {{Instance, rt_alt::UnwindInfoManagerInstanceName},
+           {FindHelper, rt_alt::UnwindInfoManagerFindSectionsHelperName},
+           {Enable, rt_alt::UnwindInfoManagerEnableWrapperName},
+           {Disable, rt_alt::UnwindInfoManagerDisableWrapperName},
+           {Register, rt_alt::UnwindInfoManagerRegisterActionName},
+           {Deregister, rt_alt::UnwindInfoManagerDeregisterActionName}}))
+    return std::move(Err);
+
+  return Create(IRL, PlatformJD, Instance, FindHelper, Enable, Disable,
+                Register, Deregister);
+}
+
+UnwindInfoRegistrationPlugin::~UnwindInfoRegistrationPlugin() {
+  using namespace shared;
+  using SPSDisableSig = SPSError(SPSExecutorAddr);
+  Error CallErr = Error::success();
+  if (auto Err = ES.callSPSWrapper<SPSDisableSig>(Disable, CallErr, Instance)) {
+    consumeError(std::move(CallErr));
+    ES.reportError(std::move(Err));
+  }
+  if (CallErr)
+    ES.reportError(std::move(CallErr));
+}
+
+void UnwindInfoRegistrationPlugin::modifyPassConfig(
+    MaterializationResponsibility &MR, LinkGraph &G,
+    PassConfiguration &PassConfig) {
+
+  PassConfig.PostFixupPasses.push_back(
+      [this](LinkGraph &G) { return addUnwindInfoRegistrationActions(G); });
+}
+
+Expected<ThreadSafeModule>
+UnwindInfoRegistrationPlugin::makeBouncerModule(ExecutionSession &ES) {
+  auto Ctx = std::make_unique<LLVMContext>();
+  auto M = std::make_unique<Module>("__libunwind_find_unwind_bouncer", *Ctx);
+  M->setTargetTriple(ES.getTargetTriple().str());
+
+  auto EscapeName = [](const char *N) { return std::string("\01") + N; };
+
+  auto *PtrTy = PointerType::getUnqual(*Ctx);
+  auto *OpaqueStructTy = StructType::create(*Ctx, "UnwindInfoMgr");
+  auto *UnwindMgrInstance = new GlobalVariable(
+      *M, OpaqueStructTy, true, GlobalValue::ExternalLinkage, nullptr,
+      EscapeName(rt_alt::UnwindInfoManagerInstanceName));
+
+  auto *Int64Ty = Type::getInt64Ty(*Ctx);
+  auto *FindHelperTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy, PtrTy}, false);
+  auto *FindHelperFn = Function::Create(
+      FindHelperTy, GlobalValue::ExternalLinkage,
+      EscapeName(rt_alt::UnwindInfoManagerFindSectionsHelperName), *M);
+
+  auto *FindFnTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy}, false);
+  auto *FindFn =
+      Function::Create(FindFnTy, GlobalValue::ExternalLinkage,
+                       EscapeName(FindDynamicUnwindSectionsFunctionName), *M);
+  auto *EntryBlock = BasicBlock::Create(M->getContext(), StringRef(), FindFn);
+  IRBuilder<> IB(EntryBlock);
+
+  std::vector<Value *> FindHelperArgs;
+  FindHelperArgs.push_back(UnwindMgrInstance);
+  for (auto &Arg : FindFn->args())
+    FindHelperArgs.push_back(&Arg);
+
+  IB.CreateRet(IB.CreateCall(FindHelperFn, FindHelperArgs));
+
+  return ThreadSafeModule(std::move(M), std::move(Ctx));
+}
+
+Error UnwindInfoRegistrationPlugin::addUnwindInfoRegistrationActions(
+    LinkGraph &G) {
+  ExecutorAddrRange EHFrameRange, UnwindInfoRange;
+
+  std::vector<Block *> CodeBlocks;
+
+  auto ScanUnwindInfoSection = [&](Section &Sec, ExecutorAddrRange &SecRange) {
+    if (Sec.empty())
+      return;
+
+    SecRange.Start = (*Sec.blocks().begin())->getAddress();
+    for (auto *B : Sec.blocks()) {
+      auto R = B->getRange();
+      SecRange.Start = std::min(SecRange.Start, R.Start);
+      SecRange.End = std::max(SecRange.End, R.End);
+      for (auto &E : B->edges()) {
+        if (E.getKind() != Edge::KeepAlive || !E.getTarget().isDefined())
+          continue;
+        auto &TargetBlock = E.getTarget().getBlock();
+        auto &TargetSection = TargetBlock.getSection();
+        if ((TargetSection.getMemProt() & MemProt::Exec) == MemProt::Exec)
+          CodeBlocks.push_back(&TargetBlock);
+      }
+    }
+  };
+
+  if (auto *EHFrame = G.findSectionByName(MachOEHFrameSectionName))
+    ScanUnwindInfoSection(*EHFrame, EHFrameRange);
+
+  if (auto *UnwindInfo = G.findSectionByName(MachOUnwindInfoSectionName))
+    ScanUnwindInfoSection(*UnwindInfo, UnwindInfoRange);
+
+  if (CodeBlocks.empty())
+    return Error::success();
+
+  if ((EHFrameRange == ExecutorAddrRange() &&
+       UnwindInfoRange == ExecutorAddrRange()))
+    return Error::success();
+
+  llvm::sort(CodeBlocks, [](const Block *LHS, const Block *RHS) {
+    return LHS->getAddress() < RHS->getAddress();
+  });
+
+  SmallVector<ExecutorAddrRange> CodeRanges;
+  for (auto *B : CodeBlocks) {
+    if (CodeRanges.empty() || CodeRanges.back().End != B->getAddress())
+      CodeRanges.push_back(B->getRange());
+    else
+      CodeRanges.back().End = B->getRange().End;
+  }
+
+  ExecutorAddr DSOBase;
+  if (auto *DSOBaseSym = G.findAbsoluteSymbolByName(DSOBaseName))
+    DSOBase = DSOBaseSym->getAddress();
+  else if (auto *DSOBaseSym = G.findExternalSymbolByName(DSOBaseName))
+    DSOBase = DSOBaseSym->getAddress();
+  else if (auto *DSOBaseSym = G.findDefinedSymbolByName(DSOBaseName))
+    DSOBase = DSOBaseSym->getAddress();
+  else
+    return make_error<StringError>("In " + G.getName() +
+                                       " could not find dso base symbol",
+                                   inconvertibleErrorCode());
+
+  using namespace shared;
+  using SPSRegisterArgs =
+      SPSArgList<SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>,
+                 SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange>;
+  using SPSDeregisterArgs =
+      SPSArgList<SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>>;
+
+  G.allocActions().push_back(
+      {cantFail(WrapperFunctionCall::Create<SPSRegisterArgs>(
+           Register, Instance, CodeRanges, DSOBase, EHFrameRange,
+           UnwindInfoRange)),
+       cantFail(WrapperFunctionCall::Create<SPSDeregisterArgs>(
+           Deregister, Instance, CodeRanges))});
+
+  return Error::success();
+}
+
+} // namespace llvm::orc
diff --git a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
index 1b8f45184833f..4ee55c61a81ea 100644
--- a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
+++ b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86_64-apple
+; REQUIRES: system-darwin && host-unwind-supports-jit
 ; RUN: lli -jit-kind=orc %s
 ;
 ; Basic correctness testing for eh-frame processing and registration.
diff --git a/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
index cd22ec65ed999..83d2d89a5d607 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86_64-apple
+; REQUIRES: system-darwin && host-unwind-supports-jit
 ; RUN: lli -jit-kind=orc-lazy %s
 ;
 ; Basic correctness testing for eh-frame processing and registration.
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 3c0069d10412a..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -566,6 +566,54 @@ def have_ld64_plugin_support():
 if have_ld64_plugin_support():
     config.available_features.add("ld64_plugin")
 
+def host_unwind_supports_jit():
+    # Do we expect the host machine to support JIT registration of clang's
+    # default unwind info format for the host (e.g. eh-frames, compact-unwind,
+    # etc.).
+
+    # Linux and the BSDs use DWARF eh-frames and all known unwinders support
+    # register_frame at minimum.
+    if platform.system() in [ "Linux", "FreeBSD", "NetBSD" ]:
+        return True
+
+    # Windows does not support frame info without the ORC runtime.
+    if platform.system() == "Windows":
+        return False
+
+    # On Darwin/x86-64 clang produces both eh-frames and compact-unwind, and
+    # libunwind supports register_frame. On Darwin/arm64 clang produces
+    # compact-unwind only, and JIT'd registration is not available before
+    # macOS 14.0.
+    if platform.system() == "Darwin":
+
+        assert (
+            "arm64" in config.host_triple
+            or "x86_64" in config.host_triple
+        )
+
+        if "x86_64" in config.host_triple:
+            return True
+
+        # Must be arm64. Check the macOS version.
+        try:
+            osx_version = subprocess.check_output(
+                ["sw_vers", "-productVersion"], universal_newlines=True
+            )
+            osx_version = tuple(int(x) for x in osx_version.split("."))
+            if len(osx_version) == 2:
+                osx_version = (osx_version[0], osx_version[1], 0)
+            if osx_version >= (14, 0):
+                return True
+        except:
+            pass
+
+        return False
+
+    return False
+
+if host_unwind_supports_jit():
+    config.available_features.add("host-unwind-supports-jit")
+
 # Ask llvm-config about asserts
 llvm_config.feature_config(
     [

From 7782b8310d49d7b3751ecc7ce7511c0c34f62efb Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 6 Feb 2025 10:50:16 +1100
Subject: [PATCH 071/282] [ORC-RT] Add a comment explaining the purpose of this
 testcase. NFC.

(cherry picked from commit aefa30e2301f155d4f4737d6f6c55c66eac58b2d)
---
 compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp
index 7e9c40c724aec..304588a327386 100644
--- a/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp
+++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp
@@ -3,6 +3,7 @@
 //
 // REQUIRES: system-darwin && host-arch-compatible
 
+// Test that trivial throw / catch works.
 int main(int argc, char *argv[]) {
   try {
     throw 42;

From 0c23bde7176d17f37923e0d8c847e48ebb700a91 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 6 Feb 2025 10:50:41 +1100
Subject: [PATCH 072/282] [ORC] Fix buggy calculation of second-level-page
 offset in unwind-info.

SecondLevelPageOffset should be incremented by SecondLevelPageSize bytes, not
one byte.

Failure to calculate the offset correctly leads to corrupted unwind-info (and
consequently broken exceptions / unwinding) when more than one second level
page is needed. Since JITLink's unwind support only produces
UNWIND_SECOND_LEVEL_REGULAR-style pages this would trigger for any file
containing more than 511 functions with unwind info. The included test-case
contains 1022 functions (sufficient for both the current format and any
future implementation that supports UNWIND_SECOND_LEVEL_COMPRESSED pages).

Thanks to @edoardo on discord for spotting this bug!

(cherry picked from commit 88f55d16c4c247a9eef326961a1445dee3f2e30c)
---
 .../Generic/exceptions-stress-test-tower.cpp  | 1039 +++++++++++++++++
 .../JITLink/CompactUnwindSupport.h            |    5 +-
 2 files changed, 1042 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp

diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp
new file mode 100644
index 0000000000000..f7a39a9dccd6f
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp
@@ -0,0 +1,1039 @@
+// RUN: %clangxx -c -o %t %s
+// RUN: %llvm_jitlink -slab-allocate=20Mb %t
+//
+// REQUIRES: system-darwin && host-arch-compatible
+
+// Test that we can throw and catch an exception through a large number of
+// stack frames. The number (1022) is chosen to force emission of multiple
+// unwind info second-level pages.
+
+static void f_0() { throw 42; }
+static void f_1() { try { f_0(); } catch (...) { throw; } }
+static void f_2() { try { f_1(); } catch (...) { throw; } }
+static void f_3() { try { f_2(); } catch (...) { throw; } }
+static void f_4() { try { f_3(); } catch (...) { throw; } }
+static void f_5() { try { f_4(); } catch (...) { throw; } }
+static void f_6() { try { f_5(); } catch (...) { throw; } }
+static void f_7() { try { f_6(); } catch (...) { throw; } }
+static void f_8() { try { f_7(); } catch (...) { throw; } }
+static void f_9() { try { f_8(); } catch (...) { throw; } }
+static void f_10() { try { f_9(); } catch (...) { throw; } }
+static void f_11() { try { f_10(); } catch (...) { throw; } }
+static void f_12() { try { f_11(); } catch (...) { throw; } }
+static void f_13() { try { f_12(); } catch (...) { throw; } }
+static void f_14() { try { f_13(); } catch (...) { throw; } }
+static void f_15() { try { f_14(); } catch (...) { throw; } }
+static void f_16() { try { f_15(); } catch (...) { throw; } }
+static void f_17() { try { f_16(); } catch (...) { throw; } }
+static void f_18() { try { f_17(); } catch (...) { throw; } }
+static void f_19() { try { f_18(); } catch (...) { throw; } }
+static void f_20() { try { f_19(); } catch (...) { throw; } }
+static void f_21() { try { f_20(); } catch (...) { throw; } }
+static void f_22() { try { f_21(); } catch (...) { throw; } }
+static void f_23() { try { f_22(); } catch (...) { throw; } }
+static void f_24() { try { f_23(); } catch (...) { throw; } }
+static void f_25() { try { f_24(); } catch (...) { throw; } }
+static void f_26() { try { f_25(); } catch (...) { throw; } }
+static void f_27() { try { f_26(); } catch (...) { throw; } }
+static void f_28() { try { f_27(); } catch (...) { throw; } }
+static void f_29() { try { f_28(); } catch (...) { throw; } }
+static void f_30() { try { f_29(); } catch (...) { throw; } }
+static void f_31() { try { f_30(); } catch (...) { throw; } }
+static void f_32() { try { f_31(); } catch (...) { throw; } }
+static void f_33() { try { f_32(); } catch (...) { throw; } }
+static void f_34() { try { f_33(); } catch (...) { throw; } }
+static void f_35() { try { f_34(); } catch (...) { throw; } }
+static void f_36() { try { f_35(); } catch (...) { throw; } }
+static void f_37() { try { f_36(); } catch (...) { throw; } }
+static void f_38() { try { f_37(); } catch (...) { throw; } }
+static void f_39() { try { f_38(); } catch (...) { throw; } }
+static void f_40() { try { f_39(); } catch (...) { throw; } }
+static void f_41() { try { f_40(); } catch (...) { throw; } }
+static void f_42() { try { f_41(); } catch (...) { throw; } }
+static void f_43() { try { f_42(); } catch (...) { throw; } }
+static void f_44() { try { f_43(); } catch (...) { throw; } }
+static void f_45() { try { f_44(); } catch (...) { throw; } }
+static void f_46() { try { f_45(); } catch (...) { throw; } }
+static void f_47() { try { f_46(); } catch (...) { throw; } }
+static void f_48() { try { f_47(); } catch (...) { throw; } }
+static void f_49() { try { f_48(); } catch (...) { throw; } }
+static void f_50() { try { f_49(); } catch (...) { throw; } }
+static void f_51() { try { f_50(); } catch (...) { throw; } }
+static void f_52() { try { f_51(); } catch (...) { throw; } }
+static void f_53() { try { f_52(); } catch (...) { throw; } }
+static void f_54() { try { f_53(); } catch (...) { throw; } }
+static void f_55() { try { f_54(); } catch (...) { throw; } }
+static void f_56() { try { f_55(); } catch (...) { throw; } }
+static void f_57() { try { f_56(); } catch (...) { throw; } }
+static void f_58() { try { f_57(); } catch (...) { throw; } }
+static void f_59() { try { f_58(); } catch (...) { throw; } }
+static void f_60() { try { f_59(); } catch (...) { throw; } }
+static void f_61() { try { f_60(); } catch (...) { throw; } }
+static void f_62() { try { f_61(); } catch (...) { throw; } }
+static void f_63() { try { f_62(); } catch (...) { throw; } }
+static void f_64() { try { f_63(); } catch (...) { throw; } }
+static void f_65() { try { f_64(); } catch (...) { throw; } }
+static void f_66() { try { f_65(); } catch (...) { throw; } }
+static void f_67() { try { f_66(); } catch (...) { throw; } }
+static void f_68() { try { f_67(); } catch (...) { throw; } }
+static void f_69() { try { f_68(); } catch (...) { throw; } }
+static void f_70() { try { f_69(); } catch (...) { throw; } }
+static void f_71() { try { f_70(); } catch (...) { throw; } }
+static void f_72() { try { f_71(); } catch (...) { throw; } }
+static void f_73() { try { f_72(); } catch (...) { throw; } }
+static void f_74() { try { f_73(); } catch (...) { throw; } }
+static void f_75() { try { f_74(); } catch (...) { throw; } }
+static void f_76() { try { f_75(); } catch (...) { throw; } }
+static void f_77() { try { f_76(); } catch (...) { throw; } }
+static void f_78() { try { f_77(); } catch (...) { throw; } }
+static void f_79() { try { f_78(); } catch (...) { throw; } }
+static void f_80() { try { f_79(); } catch (...) { throw; } }
+static void f_81() { try { f_80(); } catch (...) { throw; } }
+static void f_82() { try { f_81(); } catch (...) { throw; } }
+static void f_83() { try { f_82(); } catch (...) { throw; } }
+static void f_84() { try { f_83(); } catch (...) { throw; } }
+static void f_85() { try { f_84(); } catch (...) { throw; } }
+static void f_86() { try { f_85(); } catch (...) { throw; } }
+static void f_87() { try { f_86(); } catch (...) { throw; } }
+static void f_88() { try { f_87(); } catch (...) { throw; } }
+static void f_89() { try { f_88(); } catch (...) { throw; } }
+static void f_90() { try { f_89(); } catch (...) { throw; } }
+static void f_91() { try { f_90(); } catch (...) { throw; } }
+static void f_92() { try { f_91(); } catch (...) { throw; } }
+static void f_93() { try { f_92(); } catch (...) { throw; } }
+static void f_94() { try { f_93(); } catch (...) { throw; } }
+static void f_95() { try { f_94(); } catch (...) { throw; } }
+static void f_96() { try { f_95(); } catch (...) { throw; } }
+static void f_97() { try { f_96(); } catch (...) { throw; } }
+static void f_98() { try { f_97(); } catch (...) { throw; } }
+static void f_99() { try { f_98(); } catch (...) { throw; } }
+static void f_100() { try { f_99(); } catch (...) { throw; } }
+static void f_101() { try { f_100(); } catch (...) { throw; } }
+static void f_102() { try { f_101(); } catch (...) { throw; } }
+static void f_103() { try { f_102(); } catch (...) { throw; } }
+static void f_104() { try { f_103(); } catch (...) { throw; } }
+static void f_105() { try { f_104(); } catch (...) { throw; } }
+static void f_106() { try { f_105(); } catch (...) { throw; } }
+static void f_107() { try { f_106(); } catch (...) { throw; } }
+static void f_108() { try { f_107(); } catch (...) { throw; } }
+static void f_109() { try { f_108(); } catch (...) { throw; } }
+static void f_110() { try { f_109(); } catch (...) { throw; } }
+static void f_111() { try { f_110(); } catch (...) { throw; } }
+static void f_112() { try { f_111(); } catch (...) { throw; } }
+static void f_113() { try { f_112(); } catch (...) { throw; } }
+static void f_114() { try { f_113(); } catch (...) { throw; } }
+static void f_115() { try { f_114(); } catch (...) { throw; } }
+static void f_116() { try { f_115(); } catch (...) { throw; } }
+static void f_117() { try { f_116(); } catch (...) { throw; } }
+static void f_118() { try { f_117(); } catch (...) { throw; } }
+static void f_119() { try { f_118(); } catch (...) { throw; } }
+static void f_120() { try { f_119(); } catch (...) { throw; } }
+static void f_121() { try { f_120(); } catch (...) { throw; } }
+static void f_122() { try { f_121(); } catch (...) { throw; } }
+static void f_123() { try { f_122(); } catch (...) { throw; } }
+static void f_124() { try { f_123(); } catch (...) { throw; } }
+static void f_125() { try { f_124(); } catch (...) { throw; } }
+static void f_126() { try { f_125(); } catch (...) { throw; } }
+static void f_127() { try { f_126(); } catch (...) { throw; } }
+static void f_128() { try { f_127(); } catch (...) { throw; } }
+static void f_129() { try { f_128(); } catch (...) { throw; } }
+static void f_130() { try { f_129(); } catch (...) { throw; } }
+static void f_131() { try { f_130(); } catch (...) { throw; } }
+static void f_132() { try { f_131(); } catch (...) { throw; } }
+static void f_133() { try { f_132(); } catch (...) { throw; } }
+static void f_134() { try { f_133(); } catch (...) { throw; } }
+static void f_135() { try { f_134(); } catch (...) { throw; } }
+static void f_136() { try { f_135(); } catch (...) { throw; } }
+static void f_137() { try { f_136(); } catch (...) { throw; } }
+static void f_138() { try { f_137(); } catch (...) { throw; } }
+static void f_139() { try { f_138(); } catch (...) { throw; } }
+static void f_140() { try { f_139(); } catch (...) { throw; } }
+static void f_141() { try { f_140(); } catch (...) { throw; } }
+static void f_142() { try { f_141(); } catch (...) { throw; } }
+static void f_143() { try { f_142(); } catch (...) { throw; } }
+static void f_144() { try { f_143(); } catch (...) { throw; } }
+static void f_145() { try { f_144(); } catch (...) { throw; } }
+static void f_146() { try { f_145(); } catch (...) { throw; } }
+static void f_147() { try { f_146(); } catch (...) { throw; } }
+static void f_148() { try { f_147(); } catch (...) { throw; } }
+static void f_149() { try { f_148(); } catch (...) { throw; } }
+static void f_150() { try { f_149(); } catch (...) { throw; } }
+static void f_151() { try { f_150(); } catch (...) { throw; } }
+static void f_152() { try { f_151(); } catch (...) { throw; } }
+static void f_153() { try { f_152(); } catch (...) { throw; } }
+static void f_154() { try { f_153(); } catch (...) { throw; } }
+static void f_155() { try { f_154(); } catch (...) { throw; } }
+static void f_156() { try { f_155(); } catch (...) { throw; } }
+static void f_157() { try { f_156(); } catch (...) { throw; } }
+static void f_158() { try { f_157(); } catch (...) { throw; } }
+static void f_159() { try { f_158(); } catch (...) { throw; } }
+static void f_160() { try { f_159(); } catch (...) { throw; } }
+static void f_161() { try { f_160(); } catch (...) { throw; } }
+static void f_162() { try { f_161(); } catch (...) { throw; } }
+static void f_163() { try { f_162(); } catch (...) { throw; } }
+static void f_164() { try { f_163(); } catch (...) { throw; } }
+static void f_165() { try { f_164(); } catch (...) { throw; } }
+static void f_166() { try { f_165(); } catch (...) { throw; } }
+static void f_167() { try { f_166(); } catch (...) { throw; } }
+static void f_168() { try { f_167(); } catch (...) { throw; } }
+static void f_169() { try { f_168(); } catch (...) { throw; } }
+static void f_170() { try { f_169(); } catch (...) { throw; } }
+static void f_171() { try { f_170(); } catch (...) { throw; } }
+static void f_172() { try { f_171(); } catch (...) { throw; } }
+static void f_173() { try { f_172(); } catch (...) { throw; } }
+static void f_174() { try { f_173(); } catch (...) { throw; } }
+static void f_175() { try { f_174(); } catch (...) { throw; } }
+static void f_176() { try { f_175(); } catch (...) { throw; } }
+static void f_177() { try { f_176(); } catch (...) { throw; } }
+static void f_178() { try { f_177(); } catch (...) { throw; } }
+static void f_179() { try { f_178(); } catch (...) { throw; } }
+static void f_180() { try { f_179(); } catch (...) { throw; } }
+static void f_181() { try { f_180(); } catch (...) { throw; } }
+static void f_182() { try { f_181(); } catch (...) { throw; } }
+static void f_183() { try { f_182(); } catch (...) { throw; } }
+static void f_184() { try { f_183(); } catch (...) { throw; } }
+static void f_185() { try { f_184(); } catch (...) { throw; } }
+static void f_186() { try { f_185(); } catch (...) { throw; } }
+static void f_187() { try { f_186(); } catch (...) { throw; } }
+static void f_188() { try { f_187(); } catch (...) { throw; } }
+static void f_189() { try { f_188(); } catch (...) { throw; } }
+static void f_190() { try { f_189(); } catch (...) { throw; } }
+static void f_191() { try { f_190(); } catch (...) { throw; } }
+static void f_192() { try { f_191(); } catch (...) { throw; } }
+static void f_193() { try { f_192(); } catch (...) { throw; } }
+static void f_194() { try { f_193(); } catch (...) { throw; } }
+static void f_195() { try { f_194(); } catch (...) { throw; } }
+static void f_196() { try { f_195(); } catch (...) { throw; } }
+static void f_197() { try { f_196(); } catch (...) { throw; } }
+static void f_198() { try { f_197(); } catch (...) { throw; } }
+static void f_199() { try { f_198(); } catch (...) { throw; } }
+static void f_200() { try { f_199(); } catch (...) { throw; } }
+static void f_201() { try { f_200(); } catch (...) { throw; } }
+static void f_202() { try { f_201(); } catch (...) { throw; } }
+static void f_203() { try { f_202(); } catch (...) { throw; } }
+static void f_204() { try { f_203(); } catch (...) { throw; } }
+static void f_205() { try { f_204(); } catch (...) { throw; } }
+static void f_206() { try { f_205(); } catch (...) { throw; } }
+static void f_207() { try { f_206(); } catch (...) { throw; } }
+static void f_208() { try { f_207(); } catch (...) { throw; } }
+static void f_209() { try { f_208(); } catch (...) { throw; } }
+static void f_210() { try { f_209(); } catch (...) { throw; } }
+static void f_211() { try { f_210(); } catch (...) { throw; } }
+static void f_212() { try { f_211(); } catch (...) { throw; } }
+static void f_213() { try { f_212(); } catch (...) { throw; } }
+static void f_214() { try { f_213(); } catch (...) { throw; } }
+static void f_215() { try { f_214(); } catch (...) { throw; } }
+static void f_216() { try { f_215(); } catch (...) { throw; } }
+static void f_217() { try { f_216(); } catch (...) { throw; } }
+static void f_218() { try { f_217(); } catch (...) { throw; } }
+static void f_219() { try { f_218(); } catch (...) { throw; } }
+static void f_220() { try { f_219(); } catch (...) { throw; } }
+static void f_221() { try { f_220(); } catch (...) { throw; } }
+static void f_222() { try { f_221(); } catch (...) { throw; } }
+static void f_223() { try { f_222(); } catch (...) { throw; } }
+static void f_224() { try { f_223(); } catch (...) { throw; } }
+static void f_225() { try { f_224(); } catch (...) { throw; } }
+static void f_226() { try { f_225(); } catch (...) { throw; } }
+static void f_227() { try { f_226(); } catch (...) { throw; } }
+static void f_228() { try { f_227(); } catch (...) { throw; } }
+static void f_229() { try { f_228(); } catch (...) { throw; } }
+static void f_230() { try { f_229(); } catch (...) { throw; } }
+static void f_231() { try { f_230(); } catch (...) { throw; } }
+static void f_232() { try { f_231(); } catch (...) { throw; } }
+static void f_233() { try { f_232(); } catch (...) { throw; } }
+static void f_234() { try { f_233(); } catch (...) { throw; } }
+static void f_235() { try { f_234(); } catch (...) { throw; } }
+static void f_236() { try { f_235(); } catch (...) { throw; } }
+static void f_237() { try { f_236(); } catch (...) { throw; } }
+static void f_238() { try { f_237(); } catch (...) { throw; } }
+static void f_239() { try { f_238(); } catch (...) { throw; } }
+static void f_240() { try { f_239(); } catch (...) { throw; } }
+static void f_241() { try { f_240(); } catch (...) { throw; } }
+static void f_242() { try { f_241(); } catch (...) { throw; } }
+static void f_243() { try { f_242(); } catch (...) { throw; } }
+static void f_244() { try { f_243(); } catch (...) { throw; } }
+static void f_245() { try { f_244(); } catch (...) { throw; } }
+static void f_246() { try { f_245(); } catch (...) { throw; } }
+static void f_247() { try { f_246(); } catch (...) { throw; } }
+static void f_248() { try { f_247(); } catch (...) { throw; } }
+static void f_249() { try { f_248(); } catch (...) { throw; } }
+static void f_250() { try { f_249(); } catch (...) { throw; } }
+static void f_251() { try { f_250(); } catch (...) { throw; } }
+static void f_252() { try { f_251(); } catch (...) { throw; } }
+static void f_253() { try { f_252(); } catch (...) { throw; } }
+static void f_254() { try { f_253(); } catch (...) { throw; } }
+static void f_255() { try { f_254(); } catch (...) { throw; } }
+static void f_256() { try { f_255(); } catch (...) { throw; } }
+static void f_257() { try { f_256(); } catch (...) { throw; } }
+static void f_258() { try { f_257(); } catch (...) { throw; } }
+static void f_259() { try { f_258(); } catch (...) { throw; } }
+static void f_260() { try { f_259(); } catch (...) { throw; } }
+static void f_261() { try { f_260(); } catch (...) { throw; } }
+static void f_262() { try { f_261(); } catch (...) { throw; } }
+static void f_263() { try { f_262(); } catch (...) { throw; } }
+static void f_264() { try { f_263(); } catch (...) { throw; } }
+static void f_265() { try { f_264(); } catch (...) { throw; } }
+static void f_266() { try { f_265(); } catch (...) { throw; } }
+static void f_267() { try { f_266(); } catch (...) { throw; } }
+static void f_268() { try { f_267(); } catch (...) { throw; } }
+static void f_269() { try { f_268(); } catch (...) { throw; } }
+static void f_270() { try { f_269(); } catch (...) { throw; } }
+static void f_271() { try { f_270(); } catch (...) { throw; } }
+static void f_272() { try { f_271(); } catch (...) { throw; } }
+static void f_273() { try { f_272(); } catch (...) { throw; } }
+static void f_274() { try { f_273(); } catch (...) { throw; } }
+static void f_275() { try { f_274(); } catch (...) { throw; } }
+static void f_276() { try { f_275(); } catch (...) { throw; } }
+static void f_277() { try { f_276(); } catch (...) { throw; } }
+static void f_278() { try { f_277(); } catch (...) { throw; } }
+static void f_279() { try { f_278(); } catch (...) { throw; } }
+static void f_280() { try { f_279(); } catch (...) { throw; } }
+static void f_281() { try { f_280(); } catch (...) { throw; } }
+static void f_282() { try { f_281(); } catch (...) { throw; } }
+static void f_283() { try { f_282(); } catch (...) { throw; } }
+static void f_284() { try { f_283(); } catch (...) { throw; } }
+static void f_285() { try { f_284(); } catch (...) { throw; } }
+static void f_286() { try { f_285(); } catch (...) { throw; } }
+static void f_287() { try { f_286(); } catch (...) { throw; } }
+static void f_288() { try { f_287(); } catch (...) { throw; } }
+static void f_289() { try { f_288(); } catch (...) { throw; } }
+static void f_290() { try { f_289(); } catch (...) { throw; } }
+static void f_291() { try { f_290(); } catch (...) { throw; } }
+static void f_292() { try { f_291(); } catch (...) { throw; } }
+static void f_293() { try { f_292(); } catch (...) { throw; } }
+static void f_294() { try { f_293(); } catch (...) { throw; } }
+static void f_295() { try { f_294(); } catch (...) { throw; } }
+static void f_296() { try { f_295(); } catch (...) { throw; } }
+static void f_297() { try { f_296(); } catch (...) { throw; } }
+static void f_298() { try { f_297(); } catch (...) { throw; } }
+static void f_299() { try { f_298(); } catch (...) { throw; } }
+static void f_300() { try { f_299(); } catch (...) { throw; } }
+static void f_301() { try { f_300(); } catch (...) { throw; } }
+static void f_302() { try { f_301(); } catch (...) { throw; } }
+static void f_303() { try { f_302(); } catch (...) { throw; } }
+static void f_304() { try { f_303(); } catch (...) { throw; } }
+static void f_305() { try { f_304(); } catch (...) { throw; } }
+static void f_306() { try { f_305(); } catch (...) { throw; } }
+static void f_307() { try { f_306(); } catch (...) { throw; } }
+static void f_308() { try { f_307(); } catch (...) { throw; } }
+static void f_309() { try { f_308(); } catch (...) { throw; } }
+static void f_310() { try { f_309(); } catch (...) { throw; } }
+static void f_311() { try { f_310(); } catch (...) { throw; } }
+static void f_312() { try { f_311(); } catch (...) { throw; } }
+static void f_313() { try { f_312(); } catch (...) { throw; } }
+static void f_314() { try { f_313(); } catch (...) { throw; } }
+static void f_315() { try { f_314(); } catch (...) { throw; } }
+static void f_316() { try { f_315(); } catch (...) { throw; } }
+static void f_317() { try { f_316(); } catch (...) { throw; } }
+static void f_318() { try { f_317(); } catch (...) { throw; } }
+static void f_319() { try { f_318(); } catch (...) { throw; } }
+static void f_320() { try { f_319(); } catch (...) { throw; } }
+static void f_321() { try { f_320(); } catch (...) { throw; } }
+static void f_322() { try { f_321(); } catch (...) { throw; } }
+static void f_323() { try { f_322(); } catch (...) { throw; } }
+static void f_324() { try { f_323(); } catch (...) { throw; } }
+static void f_325() { try { f_324(); } catch (...) { throw; } }
+static void f_326() { try { f_325(); } catch (...) { throw; } }
+static void f_327() { try { f_326(); } catch (...) { throw; } }
+static void f_328() { try { f_327(); } catch (...) { throw; } }
+static void f_329() { try { f_328(); } catch (...) { throw; } }
+static void f_330() { try { f_329(); } catch (...) { throw; } }
+static void f_331() { try { f_330(); } catch (...) { throw; } }
+static void f_332() { try { f_331(); } catch (...) { throw; } }
+static void f_333() { try { f_332(); } catch (...) { throw; } }
+static void f_334() { try { f_333(); } catch (...) { throw; } }
+static void f_335() { try { f_334(); } catch (...) { throw; } }
+static void f_336() { try { f_335(); } catch (...) { throw; } }
+static void f_337() { try { f_336(); } catch (...) { throw; } }
+static void f_338() { try { f_337(); } catch (...) { throw; } }
+static void f_339() { try { f_338(); } catch (...) { throw; } }
+static void f_340() { try { f_339(); } catch (...) { throw; } }
+static void f_341() { try { f_340(); } catch (...) { throw; } }
+static void f_342() { try { f_341(); } catch (...) { throw; } }
+static void f_343() { try { f_342(); } catch (...) { throw; } }
+static void f_344() { try { f_343(); } catch (...) { throw; } }
+static void f_345() { try { f_344(); } catch (...) { throw; } }
+static void f_346() { try { f_345(); } catch (...) { throw; } }
+static void f_347() { try { f_346(); } catch (...) { throw; } }
+static void f_348() { try { f_347(); } catch (...) { throw; } }
+static void f_349() { try { f_348(); } catch (...) { throw; } }
+static void f_350() { try { f_349(); } catch (...) { throw; } }
+static void f_351() { try { f_350(); } catch (...) { throw; } }
+static void f_352() { try { f_351(); } catch (...) { throw; } }
+static void f_353() { try { f_352(); } catch (...) { throw; } }
+static void f_354() { try { f_353(); } catch (...) { throw; } }
+static void f_355() { try { f_354(); } catch (...) { throw; } }
+static void f_356() { try { f_355(); } catch (...) { throw; } }
+static void f_357() { try { f_356(); } catch (...) { throw; } }
+static void f_358() { try { f_357(); } catch (...) { throw; } }
+static void f_359() { try { f_358(); } catch (...) { throw; } }
+static void f_360() { try { f_359(); } catch (...) { throw; } }
+static void f_361() { try { f_360(); } catch (...) { throw; } }
+static void f_362() { try { f_361(); } catch (...) { throw; } }
+static void f_363() { try { f_362(); } catch (...) { throw; } }
+static void f_364() { try { f_363(); } catch (...) { throw; } }
+static void f_365() { try { f_364(); } catch (...) { throw; } }
+static void f_366() { try { f_365(); } catch (...) { throw; } }
+static void f_367() { try { f_366(); } catch (...) { throw; } }
+static void f_368() { try { f_367(); } catch (...) { throw; } }
+static void f_369() { try { f_368(); } catch (...) { throw; } }
+static void f_370() { try { f_369(); } catch (...) { throw; } }
+static void f_371() { try { f_370(); } catch (...) { throw; } }
+static void f_372() { try { f_371(); } catch (...) { throw; } }
+static void f_373() { try { f_372(); } catch (...) { throw; } }
+static void f_374() { try { f_373(); } catch (...) { throw; } }
+static void f_375() { try { f_374(); } catch (...) { throw; } }
+static void f_376() { try { f_375(); } catch (...) { throw; } }
+static void f_377() { try { f_376(); } catch (...) { throw; } }
+static void f_378() { try { f_377(); } catch (...) { throw; } }
+static void f_379() { try { f_378(); } catch (...) { throw; } }
+static void f_380() { try { f_379(); } catch (...) { throw; } }
+static void f_381() { try { f_380(); } catch (...) { throw; } }
+static void f_382() { try { f_381(); } catch (...) { throw; } }
+static void f_383() { try { f_382(); } catch (...) { throw; } }
+static void f_384() { try { f_383(); } catch (...) { throw; } }
+static void f_385() { try { f_384(); } catch (...) { throw; } }
+static void f_386() { try { f_385(); } catch (...) { throw; } }
+static void f_387() { try { f_386(); } catch (...) { throw; } }
+static void f_388() { try { f_387(); } catch (...) { throw; } }
+static void f_389() { try { f_388(); } catch (...) { throw; } }
+static void f_390() { try { f_389(); } catch (...) { throw; } }
+static void f_391() { try { f_390(); } catch (...) { throw; } }
+static void f_392() { try { f_391(); } catch (...) { throw; } }
+static void f_393() { try { f_392(); } catch (...) { throw; } }
+static void f_394() { try { f_393(); } catch (...) { throw; } }
+static void f_395() { try { f_394(); } catch (...) { throw; } }
+static void f_396() { try { f_395(); } catch (...) { throw; } }
+static void f_397() { try { f_396(); } catch (...) { throw; } }
+static void f_398() { try { f_397(); } catch (...) { throw; } }
+static void f_399() { try { f_398(); } catch (...) { throw; } }
+static void f_400() { try { f_399(); } catch (...) { throw; } }
+static void f_401() { try { f_400(); } catch (...) { throw; } }
+static void f_402() { try { f_401(); } catch (...) { throw; } }
+static void f_403() { try { f_402(); } catch (...) { throw; } }
+static void f_404() { try { f_403(); } catch (...) { throw; } }
+static void f_405() { try { f_404(); } catch (...) { throw; } }
+static void f_406() { try { f_405(); } catch (...) { throw; } }
+static void f_407() { try { f_406(); } catch (...) { throw; } }
+static void f_408() { try { f_407(); } catch (...) { throw; } }
+static void f_409() { try { f_408(); } catch (...) { throw; } }
+static void f_410() { try { f_409(); } catch (...) { throw; } }
+static void f_411() { try { f_410(); } catch (...) { throw; } }
+static void f_412() { try { f_411(); } catch (...) { throw; } }
+static void f_413() { try { f_412(); } catch (...) { throw; } }
+static void f_414() { try { f_413(); } catch (...) { throw; } }
+static void f_415() { try { f_414(); } catch (...) { throw; } }
+static void f_416() { try { f_415(); } catch (...) { throw; } }
+static void f_417() { try { f_416(); } catch (...) { throw; } }
+static void f_418() { try { f_417(); } catch (...) { throw; } }
+static void f_419() { try { f_418(); } catch (...) { throw; } }
+static void f_420() { try { f_419(); } catch (...) { throw; } }
+static void f_421() { try { f_420(); } catch (...) { throw; } }
+static void f_422() { try { f_421(); } catch (...) { throw; } }
+static void f_423() { try { f_422(); } catch (...) { throw; } }
+static void f_424() { try { f_423(); } catch (...) { throw; } }
+static void f_425() { try { f_424(); } catch (...) { throw; } }
+static void f_426() { try { f_425(); } catch (...) { throw; } }
+static void f_427() { try { f_426(); } catch (...) { throw; } }
+static void f_428() { try { f_427(); } catch (...) { throw; } }
+static void f_429() { try { f_428(); } catch (...) { throw; } }
+static void f_430() { try { f_429(); } catch (...) { throw; } }
+static void f_431() { try { f_430(); } catch (...) { throw; } }
+static void f_432() { try { f_431(); } catch (...) { throw; } }
+static void f_433() { try { f_432(); } catch (...) { throw; } }
+static void f_434() { try { f_433(); } catch (...) { throw; } }
+static void f_435() { try { f_434(); } catch (...) { throw; } }
+static void f_436() { try { f_435(); } catch (...) { throw; } }
+static void f_437() { try { f_436(); } catch (...) { throw; } }
+static void f_438() { try { f_437(); } catch (...) { throw; } }
+static void f_439() { try { f_438(); } catch (...) { throw; } }
+static void f_440() { try { f_439(); } catch (...) { throw; } }
+static void f_441() { try { f_440(); } catch (...) { throw; } }
+static void f_442() { try { f_441(); } catch (...) { throw; } }
+static void f_443() { try { f_442(); } catch (...) { throw; } }
+static void f_444() { try { f_443(); } catch (...) { throw; } }
+static void f_445() { try { f_444(); } catch (...) { throw; } }
+static void f_446() { try { f_445(); } catch (...) { throw; } }
+static void f_447() { try { f_446(); } catch (...) { throw; } }
+static void f_448() { try { f_447(); } catch (...) { throw; } }
+static void f_449() { try { f_448(); } catch (...) { throw; } }
+static void f_450() { try { f_449(); } catch (...) { throw; } }
+static void f_451() { try { f_450(); } catch (...) { throw; } }
+static void f_452() { try { f_451(); } catch (...) { throw; } }
+static void f_453() { try { f_452(); } catch (...) { throw; } }
+static void f_454() { try { f_453(); } catch (...) { throw; } }
+static void f_455() { try { f_454(); } catch (...) { throw; } }
+static void f_456() { try { f_455(); } catch (...) { throw; } }
+static void f_457() { try { f_456(); } catch (...) { throw; } }
+static void f_458() { try { f_457(); } catch (...) { throw; } }
+static void f_459() { try { f_458(); } catch (...) { throw; } }
+static void f_460() { try { f_459(); } catch (...) { throw; } }
+static void f_461() { try { f_460(); } catch (...) { throw; } }
+static void f_462() { try { f_461(); } catch (...) { throw; } }
+static void f_463() { try { f_462(); } catch (...) { throw; } }
+static void f_464() { try { f_463(); } catch (...) { throw; } }
+static void f_465() { try { f_464(); } catch (...) { throw; } }
+static void f_466() { try { f_465(); } catch (...) { throw; } }
+static void f_467() { try { f_466(); } catch (...) { throw; } }
+static void f_468() { try { f_467(); } catch (...) { throw; } }
+static void f_469() { try { f_468(); } catch (...) { throw; } }
+static void f_470() { try { f_469(); } catch (...) { throw; } }
+static void f_471() { try { f_470(); } catch (...) { throw; } }
+static void f_472() { try { f_471(); } catch (...) { throw; } }
+static void f_473() { try { f_472(); } catch (...) { throw; } }
+static void f_474() { try { f_473(); } catch (...) { throw; } }
+static void f_475() { try { f_474(); } catch (...) { throw; } }
+static void f_476() { try { f_475(); } catch (...) { throw; } }
+static void f_477() { try { f_476(); } catch (...) { throw; } }
+static void f_478() { try { f_477(); } catch (...) { throw; } }
+static void f_479() { try { f_478(); } catch (...) { throw; } }
+static void f_480() { try { f_479(); } catch (...) { throw; } }
+static void f_481() { try { f_480(); } catch (...) { throw; } }
+static void f_482() { try { f_481(); } catch (...) { throw; } }
+static void f_483() { try { f_482(); } catch (...) { throw; } }
+static void f_484() { try { f_483(); } catch (...) { throw; } }
+static void f_485() { try { f_484(); } catch (...) { throw; } }
+static void f_486() { try { f_485(); } catch (...) { throw; } }
+static void f_487() { try { f_486(); } catch (...) { throw; } }
+static void f_488() { try { f_487(); } catch (...) { throw; } }
+static void f_489() { try { f_488(); } catch (...) { throw; } }
+static void f_490() { try { f_489(); } catch (...) { throw; } }
+static void f_491() { try { f_490(); } catch (...) { throw; } }
+static void f_492() { try { f_491(); } catch (...) { throw; } }
+static void f_493() { try { f_492(); } catch (...) { throw; } }
+static void f_494() { try { f_493(); } catch (...) { throw; } }
+static void f_495() { try { f_494(); } catch (...) { throw; } }
+static void f_496() { try { f_495(); } catch (...) { throw; } }
+static void f_497() { try { f_496(); } catch (...) { throw; } }
+static void f_498() { try { f_497(); } catch (...) { throw; } }
+static void f_499() { try { f_498(); } catch (...) { throw; } }
+static void f_500() { try { f_499(); } catch (...) { throw; } }
+static void f_501() { try { f_500(); } catch (...) { throw; } }
+static void f_502() { try { f_501(); } catch (...) { throw; } }
+static void f_503() { try { f_502(); } catch (...) { throw; } }
+static void f_504() { try { f_503(); } catch (...) { throw; } }
+static void f_505() { try { f_504(); } catch (...) { throw; } }
+static void f_506() { try { f_505(); } catch (...) { throw; } }
+static void f_507() { try { f_506(); } catch (...) { throw; } }
+static void f_508() { try { f_507(); } catch (...) { throw; } }
+static void f_509() { try { f_508(); } catch (...) { throw; } }
+static void f_510() { try { f_509(); } catch (...) { throw; } }
+static void f_511() { try { f_510(); } catch (...) { throw; } }
+static void f_512() { try { f_511(); } catch (...) { throw; } }
+static void f_513() { try { f_512(); } catch (...) { throw; } }
+static void f_514() { try { f_513(); } catch (...) { throw; } }
+static void f_515() { try { f_514(); } catch (...) { throw; } }
+static void f_516() { try { f_515(); } catch (...) { throw; } }
+static void f_517() { try { f_516(); } catch (...) { throw; } }
+static void f_518() { try { f_517(); } catch (...) { throw; } }
+static void f_519() { try { f_518(); } catch (...) { throw; } }
+static void f_520() { try { f_519(); } catch (...) { throw; } }
+static void f_521() { try { f_520(); } catch (...) { throw; } }
+static void f_522() { try { f_521(); } catch (...) { throw; } }
+static void f_523() { try { f_522(); } catch (...) { throw; } }
+static void f_524() { try { f_523(); } catch (...) { throw; } }
+static void f_525() { try { f_524(); } catch (...) { throw; } }
+static void f_526() { try { f_525(); } catch (...) { throw; } }
+static void f_527() { try { f_526(); } catch (...) { throw; } }
+static void f_528() { try { f_527(); } catch (...) { throw; } }
+static void f_529() { try { f_528(); } catch (...) { throw; } }
+static void f_530() { try { f_529(); } catch (...) { throw; } }
+static void f_531() { try { f_530(); } catch (...) { throw; } }
+static void f_532() { try { f_531(); } catch (...) { throw; } }
+static void f_533() { try { f_532(); } catch (...) { throw; } }
+static void f_534() { try { f_533(); } catch (...) { throw; } }
+static void f_535() { try { f_534(); } catch (...) { throw; } }
+static void f_536() { try { f_535(); } catch (...) { throw; } }
+static void f_537() { try { f_536(); } catch (...) { throw; } }
+static void f_538() { try { f_537(); } catch (...) { throw; } }
+static void f_539() { try { f_538(); } catch (...) { throw; } }
+static void f_540() { try { f_539(); } catch (...) { throw; } }
+static void f_541() { try { f_540(); } catch (...) { throw; } }
+static void f_542() { try { f_541(); } catch (...) { throw; } }
+static void f_543() { try { f_542(); } catch (...) { throw; } }
+static void f_544() { try { f_543(); } catch (...) { throw; } }
+static void f_545() { try { f_544(); } catch (...) { throw; } }
+static void f_546() { try { f_545(); } catch (...) { throw; } }
+static void f_547() { try { f_546(); } catch (...) { throw; } }
+static void f_548() { try { f_547(); } catch (...) { throw; } }
+static void f_549() { try { f_548(); } catch (...) { throw; } }
+static void f_550() { try { f_549(); } catch (...) { throw; } }
+static void f_551() { try { f_550(); } catch (...) { throw; } }
+static void f_552() { try { f_551(); } catch (...) { throw; } }
+static void f_553() { try { f_552(); } catch (...) { throw; } }
+static void f_554() { try { f_553(); } catch (...) { throw; } }
+static void f_555() { try { f_554(); } catch (...) { throw; } }
+static void f_556() { try { f_555(); } catch (...) { throw; } }
+static void f_557() { try { f_556(); } catch (...) { throw; } }
+static void f_558() { try { f_557(); } catch (...) { throw; } }
+static void f_559() { try { f_558(); } catch (...) { throw; } }
+static void f_560() { try { f_559(); } catch (...) { throw; } }
+static void f_561() { try { f_560(); } catch (...) { throw; } }
+static void f_562() { try { f_561(); } catch (...) { throw; } }
+static void f_563() { try { f_562(); } catch (...) { throw; } }
+static void f_564() { try { f_563(); } catch (...) { throw; } }
+static void f_565() { try { f_564(); } catch (...) { throw; } }
+static void f_566() { try { f_565(); } catch (...) { throw; } }
+static void f_567() { try { f_566(); } catch (...) { throw; } }
+static void f_568() { try { f_567(); } catch (...) { throw; } }
+static void f_569() { try { f_568(); } catch (...) { throw; } }
+static void f_570() { try { f_569(); } catch (...) { throw; } }
+static void f_571() { try { f_570(); } catch (...) { throw; } }
+static void f_572() { try { f_571(); } catch (...) { throw; } }
+static void f_573() { try { f_572(); } catch (...) { throw; } }
+static void f_574() { try { f_573(); } catch (...) { throw; } }
+static void f_575() { try { f_574(); } catch (...) { throw; } }
+static void f_576() { try { f_575(); } catch (...) { throw; } }
+static void f_577() { try { f_576(); } catch (...) { throw; } }
+static void f_578() { try { f_577(); } catch (...) { throw; } }
+static void f_579() { try { f_578(); } catch (...) { throw; } }
+static void f_580() { try { f_579(); } catch (...) { throw; } }
+static void f_581() { try { f_580(); } catch (...) { throw; } }
+static void f_582() { try { f_581(); } catch (...) { throw; } }
+static void f_583() { try { f_582(); } catch (...) { throw; } }
+static void f_584() { try { f_583(); } catch (...) { throw; } }
+static void f_585() { try { f_584(); } catch (...) { throw; } }
+static void f_586() { try { f_585(); } catch (...) { throw; } }
+static void f_587() { try { f_586(); } catch (...) { throw; } }
+static void f_588() { try { f_587(); } catch (...) { throw; } }
+static void f_589() { try { f_588(); } catch (...) { throw; } }
+static void f_590() { try { f_589(); } catch (...) { throw; } }
+static void f_591() { try { f_590(); } catch (...) { throw; } }
+static void f_592() { try { f_591(); } catch (...) { throw; } }
+static void f_593() { try { f_592(); } catch (...) { throw; } }
+static void f_594() { try { f_593(); } catch (...) { throw; } }
+static void f_595() { try { f_594(); } catch (...) { throw; } }
+static void f_596() { try { f_595(); } catch (...) { throw; } }
+static void f_597() { try { f_596(); } catch (...) { throw; } }
+static void f_598() { try { f_597(); } catch (...) { throw; } }
+static void f_599() { try { f_598(); } catch (...) { throw; } }
+static void f_600() { try { f_599(); } catch (...) { throw; } }
+static void f_601() { try { f_600(); } catch (...) { throw; } }
+static void f_602() { try { f_601(); } catch (...) { throw; } }
+static void f_603() { try { f_602(); } catch (...) { throw; } }
+static void f_604() { try { f_603(); } catch (...) { throw; } }
+static void f_605() { try { f_604(); } catch (...) { throw; } }
+static void f_606() { try { f_605(); } catch (...) { throw; } }
+static void f_607() { try { f_606(); } catch (...) { throw; } }
+static void f_608() { try { f_607(); } catch (...) { throw; } }
+static void f_609() { try { f_608(); } catch (...) { throw; } }
+static void f_610() { try { f_609(); } catch (...) { throw; } }
+static void f_611() { try { f_610(); } catch (...) { throw; } }
+static void f_612() { try { f_611(); } catch (...) { throw; } }
+static void f_613() { try { f_612(); } catch (...) { throw; } }
+static void f_614() { try { f_613(); } catch (...) { throw; } }
+static void f_615() { try { f_614(); } catch (...) { throw; } }
+static void f_616() { try { f_615(); } catch (...) { throw; } }
+static void f_617() { try { f_616(); } catch (...) { throw; } }
+static void f_618() { try { f_617(); } catch (...) { throw; } }
+static void f_619() { try { f_618(); } catch (...) { throw; } }
+static void f_620() { try { f_619(); } catch (...) { throw; } }
+static void f_621() { try { f_620(); } catch (...) { throw; } }
+static void f_622() { try { f_621(); } catch (...) { throw; } }
+static void f_623() { try { f_622(); } catch (...) { throw; } }
+static void f_624() { try { f_623(); } catch (...) { throw; } }
+static void f_625() { try { f_624(); } catch (...) { throw; } }
+static void f_626() { try { f_625(); } catch (...) { throw; } }
+static void f_627() { try { f_626(); } catch (...) { throw; } }
+static void f_628() { try { f_627(); } catch (...) { throw; } }
+static void f_629() { try { f_628(); } catch (...) { throw; } }
+static void f_630() { try { f_629(); } catch (...) { throw; } }
+static void f_631() { try { f_630(); } catch (...) { throw; } }
+static void f_632() { try { f_631(); } catch (...) { throw; } }
+static void f_633() { try { f_632(); } catch (...) { throw; } }
+static void f_634() { try { f_633(); } catch (...) { throw; } }
+static void f_635() { try { f_634(); } catch (...) { throw; } }
+static void f_636() { try { f_635(); } catch (...) { throw; } }
+static void f_637() { try { f_636(); } catch (...) { throw; } }
+static void f_638() { try { f_637(); } catch (...) { throw; } }
+static void f_639() { try { f_638(); } catch (...) { throw; } }
+static void f_640() { try { f_639(); } catch (...) { throw; } }
+static void f_641() { try { f_640(); } catch (...) { throw; } }
+static void f_642() { try { f_641(); } catch (...) { throw; } }
+static void f_643() { try { f_642(); } catch (...) { throw; } }
+static void f_644() { try { f_643(); } catch (...) { throw; } }
+static void f_645() { try { f_644(); } catch (...) { throw; } }
+static void f_646() { try { f_645(); } catch (...) { throw; } }
+static void f_647() { try { f_646(); } catch (...) { throw; } }
+static void f_648() { try { f_647(); } catch (...) { throw; } }
+static void f_649() { try { f_648(); } catch (...) { throw; } }
+static void f_650() { try { f_649(); } catch (...) { throw; } }
+static void f_651() { try { f_650(); } catch (...) { throw; } }
+static void f_652() { try { f_651(); } catch (...) { throw; } }
+static void f_653() { try { f_652(); } catch (...) { throw; } }
+static void f_654() { try { f_653(); } catch (...) { throw; } }
+static void f_655() { try { f_654(); } catch (...) { throw; } }
+static void f_656() { try { f_655(); } catch (...) { throw; } }
+static void f_657() { try { f_656(); } catch (...) { throw; } }
+static void f_658() { try { f_657(); } catch (...) { throw; } }
+static void f_659() { try { f_658(); } catch (...) { throw; } }
+static void f_660() { try { f_659(); } catch (...) { throw; } }
+static void f_661() { try { f_660(); } catch (...) { throw; } }
+static void f_662() { try { f_661(); } catch (...) { throw; } }
+static void f_663() { try { f_662(); } catch (...) { throw; } }
+static void f_664() { try { f_663(); } catch (...) { throw; } }
+static void f_665() { try { f_664(); } catch (...) { throw; } }
+static void f_666() { try { f_665(); } catch (...) { throw; } }
+static void f_667() { try { f_666(); } catch (...) { throw; } }
+static void f_668() { try { f_667(); } catch (...) { throw; } }
+static void f_669() { try { f_668(); } catch (...) { throw; } }
+static void f_670() { try { f_669(); } catch (...) { throw; } }
+static void f_671() { try { f_670(); } catch (...) { throw; } }
+static void f_672() { try { f_671(); } catch (...) { throw; } }
+static void f_673() { try { f_672(); } catch (...) { throw; } }
+static void f_674() { try { f_673(); } catch (...) { throw; } }
+static void f_675() { try { f_674(); } catch (...) { throw; } }
+static void f_676() { try { f_675(); } catch (...) { throw; } }
+static void f_677() { try { f_676(); } catch (...) { throw; } }
+static void f_678() { try { f_677(); } catch (...) { throw; } }
+static void f_679() { try { f_678(); } catch (...) { throw; } }
+static void f_680() { try { f_679(); } catch (...) { throw; } }
+static void f_681() { try { f_680(); } catch (...) { throw; } }
+static void f_682() { try { f_681(); } catch (...) { throw; } }
+static void f_683() { try { f_682(); } catch (...) { throw; } }
+static void f_684() { try { f_683(); } catch (...) { throw; } }
+static void f_685() { try { f_684(); } catch (...) { throw; } }
+static void f_686() { try { f_685(); } catch (...) { throw; } }
+static void f_687() { try { f_686(); } catch (...) { throw; } }
+static void f_688() { try { f_687(); } catch (...) { throw; } }
+static void f_689() { try { f_688(); } catch (...) { throw; } }
+static void f_690() { try { f_689(); } catch (...) { throw; } }
+static void f_691() { try { f_690(); } catch (...) { throw; } }
+static void f_692() { try { f_691(); } catch (...) { throw; } }
+static void f_693() { try { f_692(); } catch (...) { throw; } }
+static void f_694() { try { f_693(); } catch (...) { throw; } }
+static void f_695() { try { f_694(); } catch (...) { throw; } }
+static void f_696() { try { f_695(); } catch (...) { throw; } }
+static void f_697() { try { f_696(); } catch (...) { throw; } }
+static void f_698() { try { f_697(); } catch (...) { throw; } }
+static void f_699() { try { f_698(); } catch (...) { throw; } }
+static void f_700() { try { f_699(); } catch (...) { throw; } }
+static void f_701() { try { f_700(); } catch (...) { throw; } }
+static void f_702() { try { f_701(); } catch (...) { throw; } }
+static void f_703() { try { f_702(); } catch (...) { throw; } }
+static void f_704() { try { f_703(); } catch (...) { throw; } }
+static void f_705() { try { f_704(); } catch (...) { throw; } }
+static void f_706() { try { f_705(); } catch (...) { throw; } }
+static void f_707() { try { f_706(); } catch (...) { throw; } }
+static void f_708() { try { f_707(); } catch (...) { throw; } }
+static void f_709() { try { f_708(); } catch (...) { throw; } }
+static void f_710() { try { f_709(); } catch (...) { throw; } }
+static void f_711() { try { f_710(); } catch (...) { throw; } }
+static void f_712() { try { f_711(); } catch (...) { throw; } }
+static void f_713() { try { f_712(); } catch (...) { throw; } }
+static void f_714() { try { f_713(); } catch (...) { throw; } }
+static void f_715() { try { f_714(); } catch (...) { throw; } }
+static void f_716() { try { f_715(); } catch (...) { throw; } }
+static void f_717() { try { f_716(); } catch (...) { throw; } }
+static void f_718() { try { f_717(); } catch (...) { throw; } }
+static void f_719() { try { f_718(); } catch (...) { throw; } }
+static void f_720() { try { f_719(); } catch (...) { throw; } }
+static void f_721() { try { f_720(); } catch (...) { throw; } }
+static void f_722() { try { f_721(); } catch (...) { throw; } }
+static void f_723() { try { f_722(); } catch (...) { throw; } }
+static void f_724() { try { f_723(); } catch (...) { throw; } }
+static void f_725() { try { f_724(); } catch (...) { throw; } }
+static void f_726() { try { f_725(); } catch (...) { throw; } }
+static void f_727() { try { f_726(); } catch (...) { throw; } }
+static void f_728() { try { f_727(); } catch (...) { throw; } }
+static void f_729() { try { f_728(); } catch (...) { throw; } }
+static void f_730() { try { f_729(); } catch (...) { throw; } }
+static void f_731() { try { f_730(); } catch (...) { throw; } }
+static void f_732() { try { f_731(); } catch (...) { throw; } }
+static void f_733() { try { f_732(); } catch (...) { throw; } }
+static void f_734() { try { f_733(); } catch (...) { throw; } }
+static void f_735() { try { f_734(); } catch (...) { throw; } }
+static void f_736() { try { f_735(); } catch (...) { throw; } }
+static void f_737() { try { f_736(); } catch (...) { throw; } }
+static void f_738() { try { f_737(); } catch (...) { throw; } }
+static void f_739() { try { f_738(); } catch (...) { throw; } }
+static void f_740() { try { f_739(); } catch (...) { throw; } }
+static void f_741() { try { f_740(); } catch (...) { throw; } }
+static void f_742() { try { f_741(); } catch (...) { throw; } }
+static void f_743() { try { f_742(); } catch (...) { throw; } }
+static void f_744() { try { f_743(); } catch (...) { throw; } }
+static void f_745() { try { f_744(); } catch (...) { throw; } }
+static void f_746() { try { f_745(); } catch (...) { throw; } }
+static void f_747() { try { f_746(); } catch (...) { throw; } }
+static void f_748() { try { f_747(); } catch (...) { throw; } }
+static void f_749() { try { f_748(); } catch (...) { throw; } }
+static void f_750() { try { f_749(); } catch (...) { throw; } }
+static void f_751() { try { f_750(); } catch (...) { throw; } }
+static void f_752() { try { f_751(); } catch (...) { throw; } }
+static void f_753() { try { f_752(); } catch (...) { throw; } }
+static void f_754() { try { f_753(); } catch (...) { throw; } }
+static void f_755() { try { f_754(); } catch (...) { throw; } }
+static void f_756() { try { f_755(); } catch (...) { throw; } }
+static void f_757() { try { f_756(); } catch (...) { throw; } }
+static void f_758() { try { f_757(); } catch (...) { throw; } }
+static void f_759() { try { f_758(); } catch (...) { throw; } }
+static void f_760() { try { f_759(); } catch (...) { throw; } }
+static void f_761() { try { f_760(); } catch (...) { throw; } }
+static void f_762() { try { f_761(); } catch (...) { throw; } }
+static void f_763() { try { f_762(); } catch (...) { throw; } }
+static void f_764() { try { f_763(); } catch (...) { throw; } }
+static void f_765() { try { f_764(); } catch (...) { throw; } }
+static void f_766() { try { f_765(); } catch (...) { throw; } }
+static void f_767() { try { f_766(); } catch (...) { throw; } }
+static void f_768() { try { f_767(); } catch (...) { throw; } }
+static void f_769() { try { f_768(); } catch (...) { throw; } }
+static void f_770() { try { f_769(); } catch (...) { throw; } }
+static void f_771() { try { f_770(); } catch (...) { throw; } }
+static void f_772() { try { f_771(); } catch (...) { throw; } }
+static void f_773() { try { f_772(); } catch (...) { throw; } }
+static void f_774() { try { f_773(); } catch (...) { throw; } }
+static void f_775() { try { f_774(); } catch (...) { throw; } }
+static void f_776() { try { f_775(); } catch (...) { throw; } }
+static void f_777() { try { f_776(); } catch (...) { throw; } }
+static void f_778() { try { f_777(); } catch (...) { throw; } }
+static void f_779() { try { f_778(); } catch (...) { throw; } }
+static void f_780() { try { f_779(); } catch (...) { throw; } }
+static void f_781() { try { f_780(); } catch (...) { throw; } }
+static void f_782() { try { f_781(); } catch (...) { throw; } }
+static void f_783() { try { f_782(); } catch (...) { throw; } }
+static void f_784() { try { f_783(); } catch (...) { throw; } }
+static void f_785() { try { f_784(); } catch (...) { throw; } }
+static void f_786() { try { f_785(); } catch (...) { throw; } }
+static void f_787() { try { f_786(); } catch (...) { throw; } }
+static void f_788() { try { f_787(); } catch (...) { throw; } }
+static void f_789() { try { f_788(); } catch (...) { throw; } }
+static void f_790() { try { f_789(); } catch (...) { throw; } }
+static void f_791() { try { f_790(); } catch (...) { throw; } }
+static void f_792() { try { f_791(); } catch (...) { throw; } }
+static void f_793() { try { f_792(); } catch (...) { throw; } }
+static void f_794() { try { f_793(); } catch (...) { throw; } }
+static void f_795() { try { f_794(); } catch (...) { throw; } }
+static void f_796() { try { f_795(); } catch (...) { throw; } }
+static void f_797() { try { f_796(); } catch (...) { throw; } }
+static void f_798() { try { f_797(); } catch (...) { throw; } }
+static void f_799() { try { f_798(); } catch (...) { throw; } }
+static void f_800() { try { f_799(); } catch (...) { throw; } }
+static void f_801() { try { f_800(); } catch (...) { throw; } }
+static void f_802() { try { f_801(); } catch (...) { throw; } }
+static void f_803() { try { f_802(); } catch (...) { throw; } }
+static void f_804() { try { f_803(); } catch (...) { throw; } }
+static void f_805() { try { f_804(); } catch (...) { throw; } }
+static void f_806() { try { f_805(); } catch (...) { throw; } }
+static void f_807() { try { f_806(); } catch (...) { throw; } }
+static void f_808() { try { f_807(); } catch (...) { throw; } }
+static void f_809() { try { f_808(); } catch (...) { throw; } }
+static void f_810() { try { f_809(); } catch (...) { throw; } }
+static void f_811() { try { f_810(); } catch (...) { throw; } }
+static void f_812() { try { f_811(); } catch (...) { throw; } }
+static void f_813() { try { f_812(); } catch (...) { throw; } }
+static void f_814() { try { f_813(); } catch (...) { throw; } }
+static void f_815() { try { f_814(); } catch (...) { throw; } }
+static void f_816() { try { f_815(); } catch (...) { throw; } }
+static void f_817() { try { f_816(); } catch (...) { throw; } }
+static void f_818() { try { f_817(); } catch (...) { throw; } }
+static void f_819() { try { f_818(); } catch (...) { throw; } }
+static void f_820() { try { f_819(); } catch (...) { throw; } }
+static void f_821() { try { f_820(); } catch (...) { throw; } }
+static void f_822() { try { f_821(); } catch (...) { throw; } }
+static void f_823() { try { f_822(); } catch (...) { throw; } }
+static void f_824() { try { f_823(); } catch (...) { throw; } }
+static void f_825() { try { f_824(); } catch (...) { throw; } }
+static void f_826() { try { f_825(); } catch (...) { throw; } }
+static void f_827() { try { f_826(); } catch (...) { throw; } }
+static void f_828() { try { f_827(); } catch (...) { throw; } }
+static void f_829() { try { f_828(); } catch (...) { throw; } }
+static void f_830() { try { f_829(); } catch (...) { throw; } }
+static void f_831() { try { f_830(); } catch (...) { throw; } }
+static void f_832() { try { f_831(); } catch (...) { throw; } }
+static void f_833() { try { f_832(); } catch (...) { throw; } }
+static void f_834() { try { f_833(); } catch (...) { throw; } }
+static void f_835() { try { f_834(); } catch (...) { throw; } }
+static void f_836() { try { f_835(); } catch (...) { throw; } }
+static void f_837() { try { f_836(); } catch (...) { throw; } }
+static void f_838() { try { f_837(); } catch (...) { throw; } }
+static void f_839() { try { f_838(); } catch (...) { throw; } }
+static void f_840() { try { f_839(); } catch (...) { throw; } }
+static void f_841() { try { f_840(); } catch (...) { throw; } }
+static void f_842() { try { f_841(); } catch (...) { throw; } }
+static void f_843() { try { f_842(); } catch (...) { throw; } }
+static void f_844() { try { f_843(); } catch (...) { throw; } }
+static void f_845() { try { f_844(); } catch (...) { throw; } }
+static void f_846() { try { f_845(); } catch (...) { throw; } }
+static void f_847() { try { f_846(); } catch (...) { throw; } }
+static void f_848() { try { f_847(); } catch (...) { throw; } }
+static void f_849() { try { f_848(); } catch (...) { throw; } }
+static void f_850() { try { f_849(); } catch (...) { throw; } }
+static void f_851() { try { f_850(); } catch (...) { throw; } }
+static void f_852() { try { f_851(); } catch (...) { throw; } }
+static void f_853() { try { f_852(); } catch (...) { throw; } }
+static void f_854() { try { f_853(); } catch (...) { throw; } }
+static void f_855() { try { f_854(); } catch (...) { throw; } }
+static void f_856() { try { f_855(); } catch (...) { throw; } }
+static void f_857() { try { f_856(); } catch (...) { throw; } }
+static void f_858() { try { f_857(); } catch (...) { throw; } }
+static void f_859() { try { f_858(); } catch (...) { throw; } }
+static void f_860() { try { f_859(); } catch (...) { throw; } }
+static void f_861() { try { f_860(); } catch (...) { throw; } }
+static void f_862() { try { f_861(); } catch (...) { throw; } }
+static void f_863() { try { f_862(); } catch (...) { throw; } }
+static void f_864() { try { f_863(); } catch (...) { throw; } }
+static void f_865() { try { f_864(); } catch (...) { throw; } }
+static void f_866() { try { f_865(); } catch (...) { throw; } }
+static void f_867() { try { f_866(); } catch (...) { throw; } }
+static void f_868() { try { f_867(); } catch (...) { throw; } }
+static void f_869() { try { f_868(); } catch (...) { throw; } }
+static void f_870() { try { f_869(); } catch (...) { throw; } }
+static void f_871() { try { f_870(); } catch (...) { throw; } }
+static void f_872() { try { f_871(); } catch (...) { throw; } }
+static void f_873() { try { f_872(); } catch (...) { throw; } }
+static void f_874() { try { f_873(); } catch (...) { throw; } }
+static void f_875() { try { f_874(); } catch (...) { throw; } }
+static void f_876() { try { f_875(); } catch (...) { throw; } }
+static void f_877() { try { f_876(); } catch (...) { throw; } }
+static void f_878() { try { f_877(); } catch (...) { throw; } }
+static void f_879() { try { f_878(); } catch (...) { throw; } }
+static void f_880() { try { f_879(); } catch (...) { throw; } }
+static void f_881() { try { f_880(); } catch (...) { throw; } }
+static void f_882() { try { f_881(); } catch (...) { throw; } }
+static void f_883() { try { f_882(); } catch (...) { throw; } }
+static void f_884() { try { f_883(); } catch (...) { throw; } }
+static void f_885() { try { f_884(); } catch (...) { throw; } }
+static void f_886() { try { f_885(); } catch (...) { throw; } }
+static void f_887() { try { f_886(); } catch (...) { throw; } }
+static void f_888() { try { f_887(); } catch (...) { throw; } }
+static void f_889() { try { f_888(); } catch (...) { throw; } }
+static void f_890() { try { f_889(); } catch (...) { throw; } }
+static void f_891() { try { f_890(); } catch (...) { throw; } }
+static void f_892() { try { f_891(); } catch (...) { throw; } }
+static void f_893() { try { f_892(); } catch (...) { throw; } }
+static void f_894() { try { f_893(); } catch (...) { throw; } }
+static void f_895() { try { f_894(); } catch (...) { throw; } }
+static void f_896() { try { f_895(); } catch (...) { throw; } }
+static void f_897() { try { f_896(); } catch (...) { throw; } }
+static void f_898() { try { f_897(); } catch (...) { throw; } }
+static void f_899() { try { f_898(); } catch (...) { throw; } }
+static void f_900() { try { f_899(); } catch (...) { throw; } }
+static void f_901() { try { f_900(); } catch (...) { throw; } }
+static void f_902() { try { f_901(); } catch (...) { throw; } }
+static void f_903() { try { f_902(); } catch (...) { throw; } }
+static void f_904() { try { f_903(); } catch (...) { throw; } }
+static void f_905() { try { f_904(); } catch (...) { throw; } }
+static void f_906() { try { f_905(); } catch (...) { throw; } }
+static void f_907() { try { f_906(); } catch (...) { throw; } }
+static void f_908() { try { f_907(); } catch (...) { throw; } }
+static void f_909() { try { f_908(); } catch (...) { throw; } }
+static void f_910() { try { f_909(); } catch (...) { throw; } }
+static void f_911() { try { f_910(); } catch (...) { throw; } }
+static void f_912() { try { f_911(); } catch (...) { throw; } }
+static void f_913() { try { f_912(); } catch (...) { throw; } }
+static void f_914() { try { f_913(); } catch (...) { throw; } }
+static void f_915() { try { f_914(); } catch (...) { throw; } }
+static void f_916() { try { f_915(); } catch (...) { throw; } }
+static void f_917() { try { f_916(); } catch (...) { throw; } }
+static void f_918() { try { f_917(); } catch (...) { throw; } }
+static void f_919() { try { f_918(); } catch (...) { throw; } }
+static void f_920() { try { f_919(); } catch (...) { throw; } }
+static void f_921() { try { f_920(); } catch (...) { throw; } }
+static void f_922() { try { f_921(); } catch (...) { throw; } }
+static void f_923() { try { f_922(); } catch (...) { throw; } }
+static void f_924() { try { f_923(); } catch (...) { throw; } }
+static void f_925() { try { f_924(); } catch (...) { throw; } }
+static void f_926() { try { f_925(); } catch (...) { throw; } }
+static void f_927() { try { f_926(); } catch (...) { throw; } }
+static void f_928() { try { f_927(); } catch (...) { throw; } }
+static void f_929() { try { f_928(); } catch (...) { throw; } }
+static void f_930() { try { f_929(); } catch (...) { throw; } }
+static void f_931() { try { f_930(); } catch (...) { throw; } }
+static void f_932() { try { f_931(); } catch (...) { throw; } }
+static void f_933() { try { f_932(); } catch (...) { throw; } }
+static void f_934() { try { f_933(); } catch (...) { throw; } }
+static void f_935() { try { f_934(); } catch (...) { throw; } }
+static void f_936() { try { f_935(); } catch (...) { throw; } }
+static void f_937() { try { f_936(); } catch (...) { throw; } }
+static void f_938() { try { f_937(); } catch (...) { throw; } }
+static void f_939() { try { f_938(); } catch (...) { throw; } }
+static void f_940() { try { f_939(); } catch (...) { throw; } }
+static void f_941() { try { f_940(); } catch (...) { throw; } }
+static void f_942() { try { f_941(); } catch (...) { throw; } }
+static void f_943() { try { f_942(); } catch (...) { throw; } }
+static void f_944() { try { f_943(); } catch (...) { throw; } }
+static void f_945() { try { f_944(); } catch (...) { throw; } }
+static void f_946() { try { f_945(); } catch (...) { throw; } }
+static void f_947() { try { f_946(); } catch (...) { throw; } }
+static void f_948() { try { f_947(); } catch (...) { throw; } }
+static void f_949() { try { f_948(); } catch (...) { throw; } }
+static void f_950() { try { f_949(); } catch (...) { throw; } }
+static void f_951() { try { f_950(); } catch (...) { throw; } }
+static void f_952() { try { f_951(); } catch (...) { throw; } }
+static void f_953() { try { f_952(); } catch (...) { throw; } }
+static void f_954() { try { f_953(); } catch (...) { throw; } }
+static void f_955() { try { f_954(); } catch (...) { throw; } }
+static void f_956() { try { f_955(); } catch (...) { throw; } }
+static void f_957() { try { f_956(); } catch (...) { throw; } }
+static void f_958() { try { f_957(); } catch (...) { throw; } }
+static void f_959() { try { f_958(); } catch (...) { throw; } }
+static void f_960() { try { f_959(); } catch (...) { throw; } }
+static void f_961() { try { f_960(); } catch (...) { throw; } }
+static void f_962() { try { f_961(); } catch (...) { throw; } }
+static void f_963() { try { f_962(); } catch (...) { throw; } }
+static void f_964() { try { f_963(); } catch (...) { throw; } }
+static void f_965() { try { f_964(); } catch (...) { throw; } }
+static void f_966() { try { f_965(); } catch (...) { throw; } }
+static void f_967() { try { f_966(); } catch (...) { throw; } }
+static void f_968() { try { f_967(); } catch (...) { throw; } }
+static void f_969() { try { f_968(); } catch (...) { throw; } }
+static void f_970() { try { f_969(); } catch (...) { throw; } }
+static void f_971() { try { f_970(); } catch (...) { throw; } }
+static void f_972() { try { f_971(); } catch (...) { throw; } }
+static void f_973() { try { f_972(); } catch (...) { throw; } }
+static void f_974() { try { f_973(); } catch (...) { throw; } }
+static void f_975() { try { f_974(); } catch (...) { throw; } }
+static void f_976() { try { f_975(); } catch (...) { throw; } }
+static void f_977() { try { f_976(); } catch (...) { throw; } }
+static void f_978() { try { f_977(); } catch (...) { throw; } }
+static void f_979() { try { f_978(); } catch (...) { throw; } }
+static void f_980() { try { f_979(); } catch (...) { throw; } }
+static void f_981() { try { f_980(); } catch (...) { throw; } }
+static void f_982() { try { f_981(); } catch (...) { throw; } }
+static void f_983() { try { f_982(); } catch (...) { throw; } }
+static void f_984() { try { f_983(); } catch (...) { throw; } }
+static void f_985() { try { f_984(); } catch (...) { throw; } }
+static void f_986() { try { f_985(); } catch (...) { throw; } }
+static void f_987() { try { f_986(); } catch (...) { throw; } }
+static void f_988() { try { f_987(); } catch (...) { throw; } }
+static void f_989() { try { f_988(); } catch (...) { throw; } }
+static void f_990() { try { f_989(); } catch (...) { throw; } }
+static void f_991() { try { f_990(); } catch (...) { throw; } }
+static void f_992() { try { f_991(); } catch (...) { throw; } }
+static void f_993() { try { f_992(); } catch (...) { throw; } }
+static void f_994() { try { f_993(); } catch (...) { throw; } }
+static void f_995() { try { f_994(); } catch (...) { throw; } }
+static void f_996() { try { f_995(); } catch (...) { throw; } }
+static void f_997() { try { f_996(); } catch (...) { throw; } }
+static void f_998() { try { f_997(); } catch (...) { throw; } }
+static void f_999() { try { f_998(); } catch (...) { throw; } }
+static void f_1000() { try { f_999(); } catch (...) { throw; } }
+static void f_1001() { try { f_1000(); } catch (...) { throw; } }
+static void f_1002() { try { f_1001(); } catch (...) { throw; } }
+static void f_1003() { try { f_1002(); } catch (...) { throw; } }
+static void f_1004() { try { f_1003(); } catch (...) { throw; } }
+static void f_1005() { try { f_1004(); } catch (...) { throw; } }
+static void f_1006() { try { f_1005(); } catch (...) { throw; } }
+static void f_1007() { try { f_1006(); } catch (...) { throw; } }
+static void f_1008() { try { f_1007(); } catch (...) { throw; } }
+static void f_1009() { try { f_1008(); } catch (...) { throw; } }
+static void f_1010() { try { f_1009(); } catch (...) { throw; } }
+static void f_1011() { try { f_1010(); } catch (...) { throw; } }
+static void f_1012() { try { f_1011(); } catch (...) { throw; } }
+static void f_1013() { try { f_1012(); } catch (...) { throw; } }
+static void f_1014() { try { f_1013(); } catch (...) { throw; } }
+static void f_1015() { try { f_1014(); } catch (...) { throw; } }
+static void f_1016() { try { f_1015(); } catch (...) { throw; } }
+static void f_1017() { try { f_1016(); } catch (...) { throw; } }
+static void f_1018() { try { f_1017(); } catch (...) { throw; } }
+static void f_1019() { try { f_1018(); } catch (...) { throw; } }
+static void f_1020() { try { f_1019(); } catch (...) { throw; } }
+int main(int argc, char *argv[]) {
+  try {
+    f_1020();
+  } catch (int n) {
+    return 42 - n;
+  }
+  return 1;
+}
+
diff --git a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
index dc3ed942aa8ac..a2c0c13cdbf66 100644
--- a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
+++ b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
@@ -510,8 +510,9 @@ template <typename CURecTraits> class CompactUnwindManager {
       // If this record marks the start of a new second level page.
       if (RecordIdx % NumRecordsPerSecondLevelPage == 0) {
         auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress();
-        auto SecondLevelPageOffset = SectionOffsetToSecondLevelPages +
-                                     (RecordIdx / NumRecordsPerSecondLevelPage);
+        auto SecondLevelPageOffset =
+            SectionOffsetToSecondLevelPages +
+            SecondLevelPageSize * (RecordIdx / NumRecordsPerSecondLevelPage);
         auto LSDAOffset =
             SectionOffsetToLSDAs + NumPreviousLSDAs * LSDAEntrySize;
 

From 975fa93b5f3715215ac2c4c74e86f466349bc0de Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 6 Feb 2025 14:24:17 +1100
Subject: [PATCH 073/282] [JITLink] Add a jitlink::Symbol::getSection()
 convenience method.

`Sym.getSection()` is equivalent to `Sym.getBlock().getSection()`.

(cherry picked from commit 4a2a8ed70da7ec44f0aa9092595e5b0f81a7e841)
---
 .../llvm/ExecutionEngine/JITLink/JITLink.h    | 11 ++++---
 .../ExecutionEngine/JITLink/COFF_x86_64.cpp   |  3 +-
 .../JITLink/CompactUnwindSupport.h            |  2 +-
 llvm/lib/ExecutionEngine/JITLink/JITLink.cpp  |  2 +-
 .../ExecutionEngine/JITLink/JITLinkGeneric.h  |  2 +-
 llvm/lib/ExecutionEngine/JITLink/aarch32.cpp  |  6 ++--
 .../Orc/Debugging/PerfSupportPlugin.cpp       |  2 +-
 .../Orc/Debugging/VTuneSupportPlugin.cpp      |  2 +-
 .../lib/ExecutionEngine/Orc/MachOPlatform.cpp |  2 +-
 .../JITLink/LinkGraphTests.cpp                | 30 +++++++++++++++++++
 10 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
index 297e603164b24..8f0dfea0c97ac 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
@@ -587,6 +587,9 @@ class Symbol {
     return static_cast<const Block &>(*Base);
   }
 
+  /// Return the Section for this Symbol (Symbol must be defined).
+  Section &getSection() const { return getBlock().getSection(); }
+
   /// Returns the offset for this symbol within the underlying addressable.
   orc::ExecutorAddrDiff getOffset() const { return Offset; }
 
@@ -1460,7 +1463,7 @@ class LinkGraph {
       A.setAddress(orc::ExecutorAddr());
     } else {
       assert(Sym.isDefined() && "Sym is not a defined symbol");
-      Section &Sec = Sym.getBlock().getSection();
+      Section &Sec = Sym.getSection();
       Sec.removeSymbol(Sym);
       Sym.makeExternal(createAddressable(orc::ExecutorAddr(), false));
     }
@@ -1488,7 +1491,7 @@ class LinkGraph {
       Sym.setScope(Scope::Local);
     } else {
       assert(Sym.isDefined() && "Sym is not a defined symbol");
-      Section &Sec = Sym.getBlock().getSection();
+      Section &Sec = Sym.getSection();
       Sec.removeSymbol(Sym);
       Sym.makeAbsolute(createAddressable(Address));
     }
@@ -1534,7 +1537,7 @@ class LinkGraph {
   transferDefinedSymbol(Symbol &Sym, Block &DestBlock,
                         orc::ExecutorAddrDiff NewOffset,
                         std::optional<orc::ExecutorAddrDiff> ExplicitNewSize) {
-    auto &OldSection = Sym.getBlock().getSection();
+    auto &OldSection = Sym.getSection();
     Sym.setBlock(DestBlock);
     Sym.setOffset(NewOffset);
     if (ExplicitNewSize)
@@ -1622,7 +1625,7 @@ class LinkGraph {
   /// Removes defined symbols. Does not remove the underlying block.
   void removeDefinedSymbol(Symbol &Sym) {
     assert(Sym.isDefined() && "Sym is not a defined symbol");
-    Sym.getBlock().getSection().removeSymbol(Sym);
+    Sym.getSection().removeSymbol(Sym);
     destroySymbol(Sym);
   }
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp
index 151f1b337087d..8ceb08051e423 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp
@@ -221,8 +221,7 @@ class COFFLinkGraphLowering_x86_64 {
         }
         case EdgeKind_coff_x86_64::SecRel32: {
           E.setAddend(E.getAddend() -
-                      getSectionStart(E.getTarget().getBlock().getSection())
-                          .getValue());
+                      getSectionStart(E.getTarget().getSection()).getValue());
           E.setKind(x86_64::Pointer32);
           break;
         }
diff --git a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
index a2c0c13cdbf66..c306264b6da5d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
+++ b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
@@ -155,7 +155,7 @@ template <typename CURecTraits> class CompactUnwindManager {
         Edge *KeepAliveEdge = nullptr;
         for (auto &E : Fn.getBlock().edges_at(0)) {
           if (E.getKind() == Edge::KeepAlive && E.getTarget().isDefined() &&
-              &E.getTarget().getBlock().getSection() == EHFrameSec) {
+              &E.getTarget().getSection() == EHFrameSec) {
             KeepAliveEdge = &E;
             break;
           }
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index 6b77330bb764b..e8ce9b2b9527d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -424,7 +424,7 @@ Error makeTargetOutOfRangeError(const LinkGraph &G, const Block &B,
     if (E.getTarget().hasName()) {
       ErrStream << "\"" << E.getTarget().getName() << "\"";
     } else
-      ErrStream << E.getTarget().getBlock().getSection().getName() << " + "
+      ErrStream << E.getTarget().getSection().getName() << " + "
                 << formatv("{0:x}", E.getOffset());
     ErrStream << " at address " << formatv("{0:x}", E.getTarget().getAddress())
               << " is out of range of " << G.getEdgeKindName(E.getKind())
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index e5d05e6b1b7bf..0bf714d6fcdf3 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -162,7 +162,7 @@ template <typename LinkerImpl> class JITLinker : public JITLinkerBase {
           // If B is a block in a Standard or Finalize section then make sure
           // that no edges point to symbols in NoAlloc sections.
           assert((NoAllocSection || !E.getTarget().isDefined() ||
-                  E.getTarget().getBlock().getSection().getMemLifetime() !=
+                  E.getTarget().getSection().getMemLifetime() !=
                       orc::MemLifetime::NoAlloc) &&
                  "Block in allocated section has edge pointing to no-alloc "
                  "section");
diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
index 7b255717f2383..01bb6e0403578 100644
--- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
@@ -874,7 +874,7 @@ bool StubsManager_prev7::visitEdge(LinkGraph &G, Block *B, Edge &E) {
   LLVM_DEBUG({
     dbgs() << "    Using " << (UseThumb ? "Thumb" : "Arm") << " entrypoint "
            << *StubEntrypoint << " in "
-           << StubEntrypoint->getBlock().getSection().getName() << "\n";
+           << StubEntrypoint->getSection().getName() << "\n";
   });
 
   E.setTarget(*StubEntrypoint);
@@ -919,8 +919,8 @@ bool StubsManager_v7::visitEdge(LinkGraph &G, Block *B, Edge &E) {
          "Instruction set states of stub and relocation site should be equal");
   LLVM_DEBUG({
     dbgs() << "    Using " << (MakeThumb ? "Thumb" : "Arm") << " entry "
-           << *StubSymbol << " in "
-           << StubSymbol->getBlock().getSection().getName() << "\n";
+           << *StubSymbol << " in " << StubSymbol->getSection().getName()
+           << "\n";
   });
 
   E.setTarget(*StubSymbol);
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp
index 7a970f3cccf46..bc5b3bb538e39 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp
@@ -113,7 +113,7 @@ getCodeLoadRecord(const Symbol &Sym, std::atomic<uint64_t> &CodeIndex) {
 
 static std::optional<PerfJITDebugInfoRecord>
 getDebugInfoRecord(const Symbol &Sym, DWARFContext &DC) {
-  auto &Section = Sym.getBlock().getSection();
+  auto &Section = Sym.getSection();
   auto Addr = Sym.getAddress();
   auto Size = Sym.getSize();
   auto SAddr = object::SectionedAddress{Addr.getValue(), Section.getOrdinal()};
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp
index 9715a503629bf..1f4557217cf24 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp
@@ -63,7 +63,7 @@ static VTuneMethodBatch getMethodBatch(LinkGraph &G, bool EmitDebugInfo) {
     if (!EmitDebugInfo)
       continue;
 
-    auto &Section = Sym->getBlock().getSection();
+    auto &Section = Sym->getSection();
     auto Addr = Sym->getAddress();
     auto SAddr =
         object::SectionedAddress{Addr.getValue(), Section.getOrdinal()};
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 44ae73671f5b1..845990d965b16 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -1071,7 +1071,7 @@ Error MachOPlatform::MachOPlatformPlugin::processObjCImageInfo(
       for (auto *B : Sec.blocks())
         for (auto &E : B->edges())
           if (E.getTarget().isDefined() &&
-              &E.getTarget().getBlock().getSection() == ObjCImageInfo)
+              &E.getTarget().getSection() == ObjCImageInfo)
             return make_error<StringError>(MachOObjCImageInfoSectionName +
                                                " is referenced within file " +
                                                G.getName(),
diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp
index fb60acddf7821..c760873493528 100644
--- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp
+++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp
@@ -54,6 +54,36 @@ TEST(LinkGraphTest, AddressAccess) {
   EXPECT_EQ(B1.getFixupAddress(E1), B1Addr + 8) << "Incorrect fixup address";
 }
 
+TEST(LinkGraphTest, DefinedSymbolProperties) {
+  // Check that Section::empty behaves as expected.
+  LinkGraph G("foo", std::make_shared<orc::SymbolStringPool>(),
+              Triple("x86_64-apple-darwin"), SubtargetFeatures(),
+              getGenericEdgeKindName);
+  auto &Sec =
+      G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write);
+  auto &B =
+      G.createContentBlock(Sec, BlockContent, orc::ExecutorAddr(0x1000), 8, 0);
+  auto &S = G.addDefinedSymbol(B, 0, "sym", 4, Linkage::Strong, Scope::Default,
+                               false, false);
+
+  EXPECT_TRUE(S.hasName());
+  EXPECT_EQ(*S.getName(), "sym");
+  EXPECT_TRUE(S.isDefined());
+  EXPECT_FALSE(S.isLive());
+  EXPECT_FALSE(S.isCallable());
+  EXPECT_FALSE(S.isExternal());
+  EXPECT_FALSE(S.isAbsolute());
+  EXPECT_EQ(&S.getBlock(), &B);
+  EXPECT_EQ(&S.getSection(), &Sec);
+  EXPECT_EQ(S.getOffset(), 0U);
+  EXPECT_EQ(S.getSize(), 4U);
+  EXPECT_EQ(S.getRange(), orc::ExecutorAddrRange(B.getAddress(), 4));
+  EXPECT_EQ(S.getSymbolContent(), BlockContent.slice(0, 4));
+  EXPECT_EQ(S.getLinkage(), Linkage::Strong);
+  EXPECT_EQ(S.getScope(), Scope::Default);
+  EXPECT_EQ(S.getTargetFlags(), 0U);
+}
+
 TEST(LinkGraphTest, SectionEmpty) {
   // Check that Section::empty behaves as expected.
   LinkGraph G("foo", std::make_shared<orc::SymbolStringPool>(),

From 749bebc5dab440377bc8365ee0d278423d19cd92 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 6 Feb 2025 16:04:35 +1100
Subject: [PATCH 074/282] [JITLink] Handle compact-unwind records that depend
 on DWARF FDEs.

Compact-unwind encodings are more limited than DWARF frame descriptions. For
functions whose frame layout cannot be described by a compact unwind encoding,
the encoding for the function will specify "use DWARF", and the corresponding
unwind-info record will use the low bits of the encoding to point to the FDE
for the function.

We test this with a frame-pointer=none function, since these frame layouts
always triger a fall-back to DWARF on arm64.

(cherry picked from commit 9d88ffe7f7b4a46d3bcb7bbdf0d7eb037ab5ba04)
---
 .../JITLink/CompactUnwindSupport.h            | 54 ++++++++++++++-----
 .../ExecutionEngine/JITLink/MachO_arm64.cpp   |  1 +
 .../ExecutionEngine/JITLink/MachO_x86_64.cpp  |  1 +
 ...-throw-catch.ll => throw-catch-minimal.ll} |  0
 4 files changed, 43 insertions(+), 13 deletions(-)
 rename llvm/test/ExecutionEngine/Orc/{minimal-throw-catch.ll => throw-catch-minimal.ll} (100%)

diff --git a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
index c306264b6da5d..c3f8834949335 100644
--- a/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
+++ b/llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h
@@ -61,6 +61,14 @@ template <typename CRTPImpl, size_t PtrSize> struct CompactUnwindTraits {
     return support::endian::read32<CRTPImpl::Endianness>(RecordContent.data() +
                                                          EncodingFieldOffset);
   }
+
+  static std::optional<uint32_t> encodeDWARFOffset(size_t Delta) {
+    uint32_t Encoded =
+        static_cast<uint32_t>(Delta) & CRTPImpl::DWARFSectionOffsetMask;
+    if (Encoded != Delta)
+      return std::nullopt;
+    return Encoded;
+  }
 };
 
 /// Architecture specific implementation of CompactUnwindManager.
@@ -133,19 +141,22 @@ template <typename CURecTraits> class CompactUnwindManager {
                  << Fn.getName() << "\n";
         });
         continue;
-      } else {
-        LLVM_DEBUG({
-          dbgs() << "    Found record for function ";
-          if (Fn.hasName())
-            dbgs() << Fn.getName();
-          else
-            dbgs() << "<anon @ " << Fn.getAddress() << '>';
-          dbgs() << '\n';
-        });
       }
 
-      bool NeedsDWARF = CURecTraits::encodingSpecifiesDWARF(
-          CURecTraits::readEncoding(B->getContent()));
+      uint32_t Encoding = CURecTraits::readEncoding(B->getContent());
+      bool NeedsDWARF = CURecTraits::encodingSpecifiesDWARF(Encoding);
+
+      LLVM_DEBUG({
+        dbgs() << "    Found record for function ";
+        if (Fn.hasName())
+          dbgs() << Fn.getName();
+        else
+          dbgs() << "<anon @ " << Fn.getAddress() << '>';
+        dbgs() << ": encoding = " << formatv("{0:x}", Encoding);
+        if (NeedsDWARF)
+          dbgs() << " (needs DWARF)";
+        dbgs() << "\n";
+      });
 
       auto &CURecSym =
           G.addAnonymousSymbol(*B, 0, CURecTraits::Size, false, false);
@@ -170,7 +181,7 @@ template <typename CURecTraits> class CompactUnwindManager {
           KeepAliveAlreadyPresent = true;
           if (NeedsDWARF) {
             LLVM_DEBUG({
-              dbgs() << "      Needs DWARF: adding keep-alive edge to FDE at "
+              dbgs() << "      Adding keep-alive edge to FDE at "
                      << FDE.getAddress() << "\n";
             });
             B->addEdge(Edge::KeepAlive, 0, FDE, 0);
@@ -595,8 +606,24 @@ template <typename CURecTraits> class CompactUnwindManager {
             ", delta to function at " + formatv("{0:x}", R.Fn->getAddress()) +
             " exceeds 32 bits");
 
+      auto Encoding = R.Encoding;
+
+      if (LLVM_UNLIKELY(CURecTraits::encodingSpecifiesDWARF(R.Encoding))) {
+        if (!EHFrameBase)
+          EHFrameBase = SectionRange(R.FDE->getSection()).getStart();
+        auto FDEDelta = R.FDE->getAddress() - EHFrameBase;
+
+        if (auto EncodedFDEDelta = CURecTraits::encodeDWARFOffset(FDEDelta))
+          Encoding |= *EncodedFDEDelta;
+        else
+          return make_error<JITLinkError>(
+              "In " + G.getName() + " " + UnwindInfoSectionName +
+              ", cannot encode delta " + formatv("{0:x}", FDEDelta) +
+              " to FDE at " + formatv("{0:x}", R.FDE->getAddress()));
+      }
+
       cantFail(W.writeInteger<uint32_t>(FnDelta));
-      cantFail(W.writeInteger<uint32_t>(R.Encoding));
+      cantFail(W.writeInteger<uint32_t>(Encoding));
 
       ++RecordIdx;
     }
@@ -639,6 +666,7 @@ template <typename CURecTraits> class CompactUnwindManager {
   StringRef UnwindInfoSectionName;
   StringRef EHFrameSectionName;
   Symbol *CompactUnwindBase = nullptr;
+  orc::ExecutorAddr EHFrameBase;
 
   size_t NumLSDAs = 0;
   size_t NumSecondLevelPages = 0;
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 4860db4f5eb37..3af0c0cdeb7c3 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -637,6 +637,7 @@ struct CompactUnwindTraits_MachO_arm64
   constexpr static endianness Endianness = endianness::little;
 
   constexpr static uint32_t EncodingModeMask = 0x0f000000;
+  constexpr static uint32_t DWARFSectionOffsetMask = 0x00ffffff;
 
   using GOTManager = aarch64::GOTTableManager;
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index d56dfdc07636d..bb5f3ab7ed43c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -512,6 +512,7 @@ struct CompactUnwindTraits_MachO_x86_64
   constexpr static endianness Endianness = endianness::little;
 
   constexpr static uint32_t EncodingModeMask = 0x0f000000;
+  constexpr static uint32_t DWARFSectionOffsetMask = 0x00ffffff;
 
   using GOTManager = x86_64::GOTTableManager;
 
diff --git a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/Orc/throw-catch-minimal.ll
similarity index 100%
rename from llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
rename to llvm/test/ExecutionEngine/Orc/throw-catch-minimal.ll

From b3876b698d0a7b4c2c61f94fc2d0877a7dfd8e20 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 6 Feb 2025 17:01:36 +1100
Subject: [PATCH 075/282] [JITLink] Add missing testcase for
 compact-unwind-needs-dwarf.

This testcase was accidentally left out of 9d88ffe7f7b.

(cherry picked from commit 7a213e70eb24e621042f2fda043622048cb1f1df)
---
 .../Orc/throw-catch-no-frame-pointer.ll       | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 llvm/test/ExecutionEngine/Orc/throw-catch-no-frame-pointer.ll

diff --git a/llvm/test/ExecutionEngine/Orc/throw-catch-no-frame-pointer.ll b/llvm/test/ExecutionEngine/Orc/throw-catch-no-frame-pointer.ll
new file mode 100644
index 0000000000000..8032e71ad2f35
--- /dev/null
+++ b/llvm/test/ExecutionEngine/Orc/throw-catch-no-frame-pointer.ll
@@ -0,0 +1,51 @@
+; REQUIRES: system-darwin && host-unwind-supports-jit
+; RUN: lli -jit-kind=orc %s
+;
+; Check that we can throw exceptions from no-fp functions. On systems that
+; support compact-unwind this implicitly tests that we correctly handle
+; unwind-info records that depend on DWARF FDEs.
+
+@_ZTIi = external constant ptr
+
+declare ptr @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(ptr, ptr, ptr)
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
+
+define void @_Z3foov() "frame-pointer"="none" {
+entry:
+  %exception = tail call ptr @__cxa_allocate_exception(i64 4)
+  store i32 42, ptr %exception
+  tail call void @__cxa_throw(ptr %exception, ptr nonnull @_ZTIi, ptr null)
+  unreachable
+}
+
+define i32 @main(i32 %argc, ptr %argv) "frame-pointer"="all" personality ptr @__gxx_personality_v0 {
+entry:
+  invoke void @_Z3foov()
+          to label %return.unreachable unwind label %lpad
+
+lpad:
+  %0 = landingpad { ptr, i32 }
+          catch ptr @_ZTIi
+  %1 = extractvalue { ptr, i32 } %0, 1
+  %2 = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi)
+  %matches = icmp eq i32 %1, %2
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:
+  %3 = extractvalue { ptr, i32 } %0, 0
+  %4 = tail call ptr @__cxa_begin_catch(ptr %3)
+  %5 = load i32, ptr %4
+  %sub = sub nsw i32 42, %5
+  tail call void @__cxa_end_catch()
+  ret i32 %sub
+
+return.unreachable:
+  unreachable
+
+eh.resume:
+  resume { ptr, i32 } %0
+}

From a9b497606405d6f88fc7949f27c275f0cec486aa Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 6 Feb 2025 17:06:03 +1100
Subject: [PATCH 076/282] [ORC-RT] Use templates to express deeply nested
 function calls in testcase.

Makes this test smaller and more readable.

(cherry picked from commit e00f824e9a5ea73830bd346115968fa9ace84cbf)
---
 .../Generic/exceptions-stress-test-tower.cpp  | 1029 +----------------
 1 file changed, 7 insertions(+), 1022 deletions(-)

diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp
index f7a39a9dccd6f..245afa084f5c0 100644
--- a/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp
+++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions-stress-test-tower.cpp
@@ -7,1030 +7,15 @@
 // stack frames. The number (1022) is chosen to force emission of multiple
 // unwind info second-level pages.
 
-static void f_0() { throw 42; }
-static void f_1() { try { f_0(); } catch (...) { throw; } }
-static void f_2() { try { f_1(); } catch (...) { throw; } }
-static void f_3() { try { f_2(); } catch (...) { throw; } }
-static void f_4() { try { f_3(); } catch (...) { throw; } }
-static void f_5() { try { f_4(); } catch (...) { throw; } }
-static void f_6() { try { f_5(); } catch (...) { throw; } }
-static void f_7() { try { f_6(); } catch (...) { throw; } }
-static void f_8() { try { f_7(); } catch (...) { throw; } }
-static void f_9() { try { f_8(); } catch (...) { throw; } }
-static void f_10() { try { f_9(); } catch (...) { throw; } }
-static void f_11() { try { f_10(); } catch (...) { throw; } }
-static void f_12() { try { f_11(); } catch (...) { throw; } }
-static void f_13() { try { f_12(); } catch (...) { throw; } }
-static void f_14() { try { f_13(); } catch (...) { throw; } }
-static void f_15() { try { f_14(); } catch (...) { throw; } }
-static void f_16() { try { f_15(); } catch (...) { throw; } }
-static void f_17() { try { f_16(); } catch (...) { throw; } }
-static void f_18() { try { f_17(); } catch (...) { throw; } }
-static void f_19() { try { f_18(); } catch (...) { throw; } }
-static void f_20() { try { f_19(); } catch (...) { throw; } }
-static void f_21() { try { f_20(); } catch (...) { throw; } }
-static void f_22() { try { f_21(); } catch (...) { throw; } }
-static void f_23() { try { f_22(); } catch (...) { throw; } }
-static void f_24() { try { f_23(); } catch (...) { throw; } }
-static void f_25() { try { f_24(); } catch (...) { throw; } }
-static void f_26() { try { f_25(); } catch (...) { throw; } }
-static void f_27() { try { f_26(); } catch (...) { throw; } }
-static void f_28() { try { f_27(); } catch (...) { throw; } }
-static void f_29() { try { f_28(); } catch (...) { throw; } }
-static void f_30() { try { f_29(); } catch (...) { throw; } }
-static void f_31() { try { f_30(); } catch (...) { throw; } }
-static void f_32() { try { f_31(); } catch (...) { throw; } }
-static void f_33() { try { f_32(); } catch (...) { throw; } }
-static void f_34() { try { f_33(); } catch (...) { throw; } }
-static void f_35() { try { f_34(); } catch (...) { throw; } }
-static void f_36() { try { f_35(); } catch (...) { throw; } }
-static void f_37() { try { f_36(); } catch (...) { throw; } }
-static void f_38() { try { f_37(); } catch (...) { throw; } }
-static void f_39() { try { f_38(); } catch (...) { throw; } }
-static void f_40() { try { f_39(); } catch (...) { throw; } }
-static void f_41() { try { f_40(); } catch (...) { throw; } }
-static void f_42() { try { f_41(); } catch (...) { throw; } }
-static void f_43() { try { f_42(); } catch (...) { throw; } }
-static void f_44() { try { f_43(); } catch (...) { throw; } }
-static void f_45() { try { f_44(); } catch (...) { throw; } }
-static void f_46() { try { f_45(); } catch (...) { throw; } }
-static void f_47() { try { f_46(); } catch (...) { throw; } }
-static void f_48() { try { f_47(); } catch (...) { throw; } }
-static void f_49() { try { f_48(); } catch (...) { throw; } }
-static void f_50() { try { f_49(); } catch (...) { throw; } }
-static void f_51() { try { f_50(); } catch (...) { throw; } }
-static void f_52() { try { f_51(); } catch (...) { throw; } }
-static void f_53() { try { f_52(); } catch (...) { throw; } }
-static void f_54() { try { f_53(); } catch (...) { throw; } }
-static void f_55() { try { f_54(); } catch (...) { throw; } }
-static void f_56() { try { f_55(); } catch (...) { throw; } }
-static void f_57() { try { f_56(); } catch (...) { throw; } }
-static void f_58() { try { f_57(); } catch (...) { throw; } }
-static void f_59() { try { f_58(); } catch (...) { throw; } }
-static void f_60() { try { f_59(); } catch (...) { throw; } }
-static void f_61() { try { f_60(); } catch (...) { throw; } }
-static void f_62() { try { f_61(); } catch (...) { throw; } }
-static void f_63() { try { f_62(); } catch (...) { throw; } }
-static void f_64() { try { f_63(); } catch (...) { throw; } }
-static void f_65() { try { f_64(); } catch (...) { throw; } }
-static void f_66() { try { f_65(); } catch (...) { throw; } }
-static void f_67() { try { f_66(); } catch (...) { throw; } }
-static void f_68() { try { f_67(); } catch (...) { throw; } }
-static void f_69() { try { f_68(); } catch (...) { throw; } }
-static void f_70() { try { f_69(); } catch (...) { throw; } }
-static void f_71() { try { f_70(); } catch (...) { throw; } }
-static void f_72() { try { f_71(); } catch (...) { throw; } }
-static void f_73() { try { f_72(); } catch (...) { throw; } }
-static void f_74() { try { f_73(); } catch (...) { throw; } }
-static void f_75() { try { f_74(); } catch (...) { throw; } }
-static void f_76() { try { f_75(); } catch (...) { throw; } }
-static void f_77() { try { f_76(); } catch (...) { throw; } }
-static void f_78() { try { f_77(); } catch (...) { throw; } }
-static void f_79() { try { f_78(); } catch (...) { throw; } }
-static void f_80() { try { f_79(); } catch (...) { throw; } }
-static void f_81() { try { f_80(); } catch (...) { throw; } }
-static void f_82() { try { f_81(); } catch (...) { throw; } }
-static void f_83() { try { f_82(); } catch (...) { throw; } }
-static void f_84() { try { f_83(); } catch (...) { throw; } }
-static void f_85() { try { f_84(); } catch (...) { throw; } }
-static void f_86() { try { f_85(); } catch (...) { throw; } }
-static void f_87() { try { f_86(); } catch (...) { throw; } }
-static void f_88() { try { f_87(); } catch (...) { throw; } }
-static void f_89() { try { f_88(); } catch (...) { throw; } }
-static void f_90() { try { f_89(); } catch (...) { throw; } }
-static void f_91() { try { f_90(); } catch (...) { throw; } }
-static void f_92() { try { f_91(); } catch (...) { throw; } }
-static void f_93() { try { f_92(); } catch (...) { throw; } }
-static void f_94() { try { f_93(); } catch (...) { throw; } }
-static void f_95() { try { f_94(); } catch (...) { throw; } }
-static void f_96() { try { f_95(); } catch (...) { throw; } }
-static void f_97() { try { f_96(); } catch (...) { throw; } }
-static void f_98() { try { f_97(); } catch (...) { throw; } }
-static void f_99() { try { f_98(); } catch (...) { throw; } }
-static void f_100() { try { f_99(); } catch (...) { throw; } }
-static void f_101() { try { f_100(); } catch (...) { throw; } }
-static void f_102() { try { f_101(); } catch (...) { throw; } }
-static void f_103() { try { f_102(); } catch (...) { throw; } }
-static void f_104() { try { f_103(); } catch (...) { throw; } }
-static void f_105() { try { f_104(); } catch (...) { throw; } }
-static void f_106() { try { f_105(); } catch (...) { throw; } }
-static void f_107() { try { f_106(); } catch (...) { throw; } }
-static void f_108() { try { f_107(); } catch (...) { throw; } }
-static void f_109() { try { f_108(); } catch (...) { throw; } }
-static void f_110() { try { f_109(); } catch (...) { throw; } }
-static void f_111() { try { f_110(); } catch (...) { throw; } }
-static void f_112() { try { f_111(); } catch (...) { throw; } }
-static void f_113() { try { f_112(); } catch (...) { throw; } }
-static void f_114() { try { f_113(); } catch (...) { throw; } }
-static void f_115() { try { f_114(); } catch (...) { throw; } }
-static void f_116() { try { f_115(); } catch (...) { throw; } }
-static void f_117() { try { f_116(); } catch (...) { throw; } }
-static void f_118() { try { f_117(); } catch (...) { throw; } }
-static void f_119() { try { f_118(); } catch (...) { throw; } }
-static void f_120() { try { f_119(); } catch (...) { throw; } }
-static void f_121() { try { f_120(); } catch (...) { throw; } }
-static void f_122() { try { f_121(); } catch (...) { throw; } }
-static void f_123() { try { f_122(); } catch (...) { throw; } }
-static void f_124() { try { f_123(); } catch (...) { throw; } }
-static void f_125() { try { f_124(); } catch (...) { throw; } }
-static void f_126() { try { f_125(); } catch (...) { throw; } }
-static void f_127() { try { f_126(); } catch (...) { throw; } }
-static void f_128() { try { f_127(); } catch (...) { throw; } }
-static void f_129() { try { f_128(); } catch (...) { throw; } }
-static void f_130() { try { f_129(); } catch (...) { throw; } }
-static void f_131() { try { f_130(); } catch (...) { throw; } }
-static void f_132() { try { f_131(); } catch (...) { throw; } }
-static void f_133() { try { f_132(); } catch (...) { throw; } }
-static void f_134() { try { f_133(); } catch (...) { throw; } }
-static void f_135() { try { f_134(); } catch (...) { throw; } }
-static void f_136() { try { f_135(); } catch (...) { throw; } }
-static void f_137() { try { f_136(); } catch (...) { throw; } }
-static void f_138() { try { f_137(); } catch (...) { throw; } }
-static void f_139() { try { f_138(); } catch (...) { throw; } }
-static void f_140() { try { f_139(); } catch (...) { throw; } }
-static void f_141() { try { f_140(); } catch (...) { throw; } }
-static void f_142() { try { f_141(); } catch (...) { throw; } }
-static void f_143() { try { f_142(); } catch (...) { throw; } }
-static void f_144() { try { f_143(); } catch (...) { throw; } }
-static void f_145() { try { f_144(); } catch (...) { throw; } }
-static void f_146() { try { f_145(); } catch (...) { throw; } }
-static void f_147() { try { f_146(); } catch (...) { throw; } }
-static void f_148() { try { f_147(); } catch (...) { throw; } }
-static void f_149() { try { f_148(); } catch (...) { throw; } }
-static void f_150() { try { f_149(); } catch (...) { throw; } }
-static void f_151() { try { f_150(); } catch (...) { throw; } }
-static void f_152() { try { f_151(); } catch (...) { throw; } }
-static void f_153() { try { f_152(); } catch (...) { throw; } }
-static void f_154() { try { f_153(); } catch (...) { throw; } }
-static void f_155() { try { f_154(); } catch (...) { throw; } }
-static void f_156() { try { f_155(); } catch (...) { throw; } }
-static void f_157() { try { f_156(); } catch (...) { throw; } }
-static void f_158() { try { f_157(); } catch (...) { throw; } }
-static void f_159() { try { f_158(); } catch (...) { throw; } }
-static void f_160() { try { f_159(); } catch (...) { throw; } }
-static void f_161() { try { f_160(); } catch (...) { throw; } }
-static void f_162() { try { f_161(); } catch (...) { throw; } }
-static void f_163() { try { f_162(); } catch (...) { throw; } }
-static void f_164() { try { f_163(); } catch (...) { throw; } }
-static void f_165() { try { f_164(); } catch (...) { throw; } }
-static void f_166() { try { f_165(); } catch (...) { throw; } }
-static void f_167() { try { f_166(); } catch (...) { throw; } }
-static void f_168() { try { f_167(); } catch (...) { throw; } }
-static void f_169() { try { f_168(); } catch (...) { throw; } }
-static void f_170() { try { f_169(); } catch (...) { throw; } }
-static void f_171() { try { f_170(); } catch (...) { throw; } }
-static void f_172() { try { f_171(); } catch (...) { throw; } }
-static void f_173() { try { f_172(); } catch (...) { throw; } }
-static void f_174() { try { f_173(); } catch (...) { throw; } }
-static void f_175() { try { f_174(); } catch (...) { throw; } }
-static void f_176() { try { f_175(); } catch (...) { throw; } }
-static void f_177() { try { f_176(); } catch (...) { throw; } }
-static void f_178() { try { f_177(); } catch (...) { throw; } }
-static void f_179() { try { f_178(); } catch (...) { throw; } }
-static void f_180() { try { f_179(); } catch (...) { throw; } }
-static void f_181() { try { f_180(); } catch (...) { throw; } }
-static void f_182() { try { f_181(); } catch (...) { throw; } }
-static void f_183() { try { f_182(); } catch (...) { throw; } }
-static void f_184() { try { f_183(); } catch (...) { throw; } }
-static void f_185() { try { f_184(); } catch (...) { throw; } }
-static void f_186() { try { f_185(); } catch (...) { throw; } }
-static void f_187() { try { f_186(); } catch (...) { throw; } }
-static void f_188() { try { f_187(); } catch (...) { throw; } }
-static void f_189() { try { f_188(); } catch (...) { throw; } }
-static void f_190() { try { f_189(); } catch (...) { throw; } }
-static void f_191() { try { f_190(); } catch (...) { throw; } }
-static void f_192() { try { f_191(); } catch (...) { throw; } }
-static void f_193() { try { f_192(); } catch (...) { throw; } }
-static void f_194() { try { f_193(); } catch (...) { throw; } }
-static void f_195() { try { f_194(); } catch (...) { throw; } }
-static void f_196() { try { f_195(); } catch (...) { throw; } }
-static void f_197() { try { f_196(); } catch (...) { throw; } }
-static void f_198() { try { f_197(); } catch (...) { throw; } }
-static void f_199() { try { f_198(); } catch (...) { throw; } }
-static void f_200() { try { f_199(); } catch (...) { throw; } }
-static void f_201() { try { f_200(); } catch (...) { throw; } }
-static void f_202() { try { f_201(); } catch (...) { throw; } }
-static void f_203() { try { f_202(); } catch (...) { throw; } }
-static void f_204() { try { f_203(); } catch (...) { throw; } }
-static void f_205() { try { f_204(); } catch (...) { throw; } }
-static void f_206() { try { f_205(); } catch (...) { throw; } }
-static void f_207() { try { f_206(); } catch (...) { throw; } }
-static void f_208() { try { f_207(); } catch (...) { throw; } }
-static void f_209() { try { f_208(); } catch (...) { throw; } }
-static void f_210() { try { f_209(); } catch (...) { throw; } }
-static void f_211() { try { f_210(); } catch (...) { throw; } }
-static void f_212() { try { f_211(); } catch (...) { throw; } }
-static void f_213() { try { f_212(); } catch (...) { throw; } }
-static void f_214() { try { f_213(); } catch (...) { throw; } }
-static void f_215() { try { f_214(); } catch (...) { throw; } }
-static void f_216() { try { f_215(); } catch (...) { throw; } }
-static void f_217() { try { f_216(); } catch (...) { throw; } }
-static void f_218() { try { f_217(); } catch (...) { throw; } }
-static void f_219() { try { f_218(); } catch (...) { throw; } }
-static void f_220() { try { f_219(); } catch (...) { throw; } }
-static void f_221() { try { f_220(); } catch (...) { throw; } }
-static void f_222() { try { f_221(); } catch (...) { throw; } }
-static void f_223() { try { f_222(); } catch (...) { throw; } }
-static void f_224() { try { f_223(); } catch (...) { throw; } }
-static void f_225() { try { f_224(); } catch (...) { throw; } }
-static void f_226() { try { f_225(); } catch (...) { throw; } }
-static void f_227() { try { f_226(); } catch (...) { throw; } }
-static void f_228() { try { f_227(); } catch (...) { throw; } }
-static void f_229() { try { f_228(); } catch (...) { throw; } }
-static void f_230() { try { f_229(); } catch (...) { throw; } }
-static void f_231() { try { f_230(); } catch (...) { throw; } }
-static void f_232() { try { f_231(); } catch (...) { throw; } }
-static void f_233() { try { f_232(); } catch (...) { throw; } }
-static void f_234() { try { f_233(); } catch (...) { throw; } }
-static void f_235() { try { f_234(); } catch (...) { throw; } }
-static void f_236() { try { f_235(); } catch (...) { throw; } }
-static void f_237() { try { f_236(); } catch (...) { throw; } }
-static void f_238() { try { f_237(); } catch (...) { throw; } }
-static void f_239() { try { f_238(); } catch (...) { throw; } }
-static void f_240() { try { f_239(); } catch (...) { throw; } }
-static void f_241() { try { f_240(); } catch (...) { throw; } }
-static void f_242() { try { f_241(); } catch (...) { throw; } }
-static void f_243() { try { f_242(); } catch (...) { throw; } }
-static void f_244() { try { f_243(); } catch (...) { throw; } }
-static void f_245() { try { f_244(); } catch (...) { throw; } }
-static void f_246() { try { f_245(); } catch (...) { throw; } }
-static void f_247() { try { f_246(); } catch (...) { throw; } }
-static void f_248() { try { f_247(); } catch (...) { throw; } }
-static void f_249() { try { f_248(); } catch (...) { throw; } }
-static void f_250() { try { f_249(); } catch (...) { throw; } }
-static void f_251() { try { f_250(); } catch (...) { throw; } }
-static void f_252() { try { f_251(); } catch (...) { throw; } }
-static void f_253() { try { f_252(); } catch (...) { throw; } }
-static void f_254() { try { f_253(); } catch (...) { throw; } }
-static void f_255() { try { f_254(); } catch (...) { throw; } }
-static void f_256() { try { f_255(); } catch (...) { throw; } }
-static void f_257() { try { f_256(); } catch (...) { throw; } }
-static void f_258() { try { f_257(); } catch (...) { throw; } }
-static void f_259() { try { f_258(); } catch (...) { throw; } }
-static void f_260() { try { f_259(); } catch (...) { throw; } }
-static void f_261() { try { f_260(); } catch (...) { throw; } }
-static void f_262() { try { f_261(); } catch (...) { throw; } }
-static void f_263() { try { f_262(); } catch (...) { throw; } }
-static void f_264() { try { f_263(); } catch (...) { throw; } }
-static void f_265() { try { f_264(); } catch (...) { throw; } }
-static void f_266() { try { f_265(); } catch (...) { throw; } }
-static void f_267() { try { f_266(); } catch (...) { throw; } }
-static void f_268() { try { f_267(); } catch (...) { throw; } }
-static void f_269() { try { f_268(); } catch (...) { throw; } }
-static void f_270() { try { f_269(); } catch (...) { throw; } }
-static void f_271() { try { f_270(); } catch (...) { throw; } }
-static void f_272() { try { f_271(); } catch (...) { throw; } }
-static void f_273() { try { f_272(); } catch (...) { throw; } }
-static void f_274() { try { f_273(); } catch (...) { throw; } }
-static void f_275() { try { f_274(); } catch (...) { throw; } }
-static void f_276() { try { f_275(); } catch (...) { throw; } }
-static void f_277() { try { f_276(); } catch (...) { throw; } }
-static void f_278() { try { f_277(); } catch (...) { throw; } }
-static void f_279() { try { f_278(); } catch (...) { throw; } }
-static void f_280() { try { f_279(); } catch (...) { throw; } }
-static void f_281() { try { f_280(); } catch (...) { throw; } }
-static void f_282() { try { f_281(); } catch (...) { throw; } }
-static void f_283() { try { f_282(); } catch (...) { throw; } }
-static void f_284() { try { f_283(); } catch (...) { throw; } }
-static void f_285() { try { f_284(); } catch (...) { throw; } }
-static void f_286() { try { f_285(); } catch (...) { throw; } }
-static void f_287() { try { f_286(); } catch (...) { throw; } }
-static void f_288() { try { f_287(); } catch (...) { throw; } }
-static void f_289() { try { f_288(); } catch (...) { throw; } }
-static void f_290() { try { f_289(); } catch (...) { throw; } }
-static void f_291() { try { f_290(); } catch (...) { throw; } }
-static void f_292() { try { f_291(); } catch (...) { throw; } }
-static void f_293() { try { f_292(); } catch (...) { throw; } }
-static void f_294() { try { f_293(); } catch (...) { throw; } }
-static void f_295() { try { f_294(); } catch (...) { throw; } }
-static void f_296() { try { f_295(); } catch (...) { throw; } }
-static void f_297() { try { f_296(); } catch (...) { throw; } }
-static void f_298() { try { f_297(); } catch (...) { throw; } }
-static void f_299() { try { f_298(); } catch (...) { throw; } }
-static void f_300() { try { f_299(); } catch (...) { throw; } }
-static void f_301() { try { f_300(); } catch (...) { throw; } }
-static void f_302() { try { f_301(); } catch (...) { throw; } }
-static void f_303() { try { f_302(); } catch (...) { throw; } }
-static void f_304() { try { f_303(); } catch (...) { throw; } }
-static void f_305() { try { f_304(); } catch (...) { throw; } }
-static void f_306() { try { f_305(); } catch (...) { throw; } }
-static void f_307() { try { f_306(); } catch (...) { throw; } }
-static void f_308() { try { f_307(); } catch (...) { throw; } }
-static void f_309() { try { f_308(); } catch (...) { throw; } }
-static void f_310() { try { f_309(); } catch (...) { throw; } }
-static void f_311() { try { f_310(); } catch (...) { throw; } }
-static void f_312() { try { f_311(); } catch (...) { throw; } }
-static void f_313() { try { f_312(); } catch (...) { throw; } }
-static void f_314() { try { f_313(); } catch (...) { throw; } }
-static void f_315() { try { f_314(); } catch (...) { throw; } }
-static void f_316() { try { f_315(); } catch (...) { throw; } }
-static void f_317() { try { f_316(); } catch (...) { throw; } }
-static void f_318() { try { f_317(); } catch (...) { throw; } }
-static void f_319() { try { f_318(); } catch (...) { throw; } }
-static void f_320() { try { f_319(); } catch (...) { throw; } }
-static void f_321() { try { f_320(); } catch (...) { throw; } }
-static void f_322() { try { f_321(); } catch (...) { throw; } }
-static void f_323() { try { f_322(); } catch (...) { throw; } }
-static void f_324() { try { f_323(); } catch (...) { throw; } }
-static void f_325() { try { f_324(); } catch (...) { throw; } }
-static void f_326() { try { f_325(); } catch (...) { throw; } }
-static void f_327() { try { f_326(); } catch (...) { throw; } }
-static void f_328() { try { f_327(); } catch (...) { throw; } }
-static void f_329() { try { f_328(); } catch (...) { throw; } }
-static void f_330() { try { f_329(); } catch (...) { throw; } }
-static void f_331() { try { f_330(); } catch (...) { throw; } }
-static void f_332() { try { f_331(); } catch (...) { throw; } }
-static void f_333() { try { f_332(); } catch (...) { throw; } }
-static void f_334() { try { f_333(); } catch (...) { throw; } }
-static void f_335() { try { f_334(); } catch (...) { throw; } }
-static void f_336() { try { f_335(); } catch (...) { throw; } }
-static void f_337() { try { f_336(); } catch (...) { throw; } }
-static void f_338() { try { f_337(); } catch (...) { throw; } }
-static void f_339() { try { f_338(); } catch (...) { throw; } }
-static void f_340() { try { f_339(); } catch (...) { throw; } }
-static void f_341() { try { f_340(); } catch (...) { throw; } }
-static void f_342() { try { f_341(); } catch (...) { throw; } }
-static void f_343() { try { f_342(); } catch (...) { throw; } }
-static void f_344() { try { f_343(); } catch (...) { throw; } }
-static void f_345() { try { f_344(); } catch (...) { throw; } }
-static void f_346() { try { f_345(); } catch (...) { throw; } }
-static void f_347() { try { f_346(); } catch (...) { throw; } }
-static void f_348() { try { f_347(); } catch (...) { throw; } }
-static void f_349() { try { f_348(); } catch (...) { throw; } }
-static void f_350() { try { f_349(); } catch (...) { throw; } }
-static void f_351() { try { f_350(); } catch (...) { throw; } }
-static void f_352() { try { f_351(); } catch (...) { throw; } }
-static void f_353() { try { f_352(); } catch (...) { throw; } }
-static void f_354() { try { f_353(); } catch (...) { throw; } }
-static void f_355() { try { f_354(); } catch (...) { throw; } }
-static void f_356() { try { f_355(); } catch (...) { throw; } }
-static void f_357() { try { f_356(); } catch (...) { throw; } }
-static void f_358() { try { f_357(); } catch (...) { throw; } }
-static void f_359() { try { f_358(); } catch (...) { throw; } }
-static void f_360() { try { f_359(); } catch (...) { throw; } }
-static void f_361() { try { f_360(); } catch (...) { throw; } }
-static void f_362() { try { f_361(); } catch (...) { throw; } }
-static void f_363() { try { f_362(); } catch (...) { throw; } }
-static void f_364() { try { f_363(); } catch (...) { throw; } }
-static void f_365() { try { f_364(); } catch (...) { throw; } }
-static void f_366() { try { f_365(); } catch (...) { throw; } }
-static void f_367() { try { f_366(); } catch (...) { throw; } }
-static void f_368() { try { f_367(); } catch (...) { throw; } }
-static void f_369() { try { f_368(); } catch (...) { throw; } }
-static void f_370() { try { f_369(); } catch (...) { throw; } }
-static void f_371() { try { f_370(); } catch (...) { throw; } }
-static void f_372() { try { f_371(); } catch (...) { throw; } }
-static void f_373() { try { f_372(); } catch (...) { throw; } }
-static void f_374() { try { f_373(); } catch (...) { throw; } }
-static void f_375() { try { f_374(); } catch (...) { throw; } }
-static void f_376() { try { f_375(); } catch (...) { throw; } }
-static void f_377() { try { f_376(); } catch (...) { throw; } }
-static void f_378() { try { f_377(); } catch (...) { throw; } }
-static void f_379() { try { f_378(); } catch (...) { throw; } }
-static void f_380() { try { f_379(); } catch (...) { throw; } }
-static void f_381() { try { f_380(); } catch (...) { throw; } }
-static void f_382() { try { f_381(); } catch (...) { throw; } }
-static void f_383() { try { f_382(); } catch (...) { throw; } }
-static void f_384() { try { f_383(); } catch (...) { throw; } }
-static void f_385() { try { f_384(); } catch (...) { throw; } }
-static void f_386() { try { f_385(); } catch (...) { throw; } }
-static void f_387() { try { f_386(); } catch (...) { throw; } }
-static void f_388() { try { f_387(); } catch (...) { throw; } }
-static void f_389() { try { f_388(); } catch (...) { throw; } }
-static void f_390() { try { f_389(); } catch (...) { throw; } }
-static void f_391() { try { f_390(); } catch (...) { throw; } }
-static void f_392() { try { f_391(); } catch (...) { throw; } }
-static void f_393() { try { f_392(); } catch (...) { throw; } }
-static void f_394() { try { f_393(); } catch (...) { throw; } }
-static void f_395() { try { f_394(); } catch (...) { throw; } }
-static void f_396() { try { f_395(); } catch (...) { throw; } }
-static void f_397() { try { f_396(); } catch (...) { throw; } }
-static void f_398() { try { f_397(); } catch (...) { throw; } }
-static void f_399() { try { f_398(); } catch (...) { throw; } }
-static void f_400() { try { f_399(); } catch (...) { throw; } }
-static void f_401() { try { f_400(); } catch (...) { throw; } }
-static void f_402() { try { f_401(); } catch (...) { throw; } }
-static void f_403() { try { f_402(); } catch (...) { throw; } }
-static void f_404() { try { f_403(); } catch (...) { throw; } }
-static void f_405() { try { f_404(); } catch (...) { throw; } }
-static void f_406() { try { f_405(); } catch (...) { throw; } }
-static void f_407() { try { f_406(); } catch (...) { throw; } }
-static void f_408() { try { f_407(); } catch (...) { throw; } }
-static void f_409() { try { f_408(); } catch (...) { throw; } }
-static void f_410() { try { f_409(); } catch (...) { throw; } }
-static void f_411() { try { f_410(); } catch (...) { throw; } }
-static void f_412() { try { f_411(); } catch (...) { throw; } }
-static void f_413() { try { f_412(); } catch (...) { throw; } }
-static void f_414() { try { f_413(); } catch (...) { throw; } }
-static void f_415() { try { f_414(); } catch (...) { throw; } }
-static void f_416() { try { f_415(); } catch (...) { throw; } }
-static void f_417() { try { f_416(); } catch (...) { throw; } }
-static void f_418() { try { f_417(); } catch (...) { throw; } }
-static void f_419() { try { f_418(); } catch (...) { throw; } }
-static void f_420() { try { f_419(); } catch (...) { throw; } }
-static void f_421() { try { f_420(); } catch (...) { throw; } }
-static void f_422() { try { f_421(); } catch (...) { throw; } }
-static void f_423() { try { f_422(); } catch (...) { throw; } }
-static void f_424() { try { f_423(); } catch (...) { throw; } }
-static void f_425() { try { f_424(); } catch (...) { throw; } }
-static void f_426() { try { f_425(); } catch (...) { throw; } }
-static void f_427() { try { f_426(); } catch (...) { throw; } }
-static void f_428() { try { f_427(); } catch (...) { throw; } }
-static void f_429() { try { f_428(); } catch (...) { throw; } }
-static void f_430() { try { f_429(); } catch (...) { throw; } }
-static void f_431() { try { f_430(); } catch (...) { throw; } }
-static void f_432() { try { f_431(); } catch (...) { throw; } }
-static void f_433() { try { f_432(); } catch (...) { throw; } }
-static void f_434() { try { f_433(); } catch (...) { throw; } }
-static void f_435() { try { f_434(); } catch (...) { throw; } }
-static void f_436() { try { f_435(); } catch (...) { throw; } }
-static void f_437() { try { f_436(); } catch (...) { throw; } }
-static void f_438() { try { f_437(); } catch (...) { throw; } }
-static void f_439() { try { f_438(); } catch (...) { throw; } }
-static void f_440() { try { f_439(); } catch (...) { throw; } }
-static void f_441() { try { f_440(); } catch (...) { throw; } }
-static void f_442() { try { f_441(); } catch (...) { throw; } }
-static void f_443() { try { f_442(); } catch (...) { throw; } }
-static void f_444() { try { f_443(); } catch (...) { throw; } }
-static void f_445() { try { f_444(); } catch (...) { throw; } }
-static void f_446() { try { f_445(); } catch (...) { throw; } }
-static void f_447() { try { f_446(); } catch (...) { throw; } }
-static void f_448() { try { f_447(); } catch (...) { throw; } }
-static void f_449() { try { f_448(); } catch (...) { throw; } }
-static void f_450() { try { f_449(); } catch (...) { throw; } }
-static void f_451() { try { f_450(); } catch (...) { throw; } }
-static void f_452() { try { f_451(); } catch (...) { throw; } }
-static void f_453() { try { f_452(); } catch (...) { throw; } }
-static void f_454() { try { f_453(); } catch (...) { throw; } }
-static void f_455() { try { f_454(); } catch (...) { throw; } }
-static void f_456() { try { f_455(); } catch (...) { throw; } }
-static void f_457() { try { f_456(); } catch (...) { throw; } }
-static void f_458() { try { f_457(); } catch (...) { throw; } }
-static void f_459() { try { f_458(); } catch (...) { throw; } }
-static void f_460() { try { f_459(); } catch (...) { throw; } }
-static void f_461() { try { f_460(); } catch (...) { throw; } }
-static void f_462() { try { f_461(); } catch (...) { throw; } }
-static void f_463() { try { f_462(); } catch (...) { throw; } }
-static void f_464() { try { f_463(); } catch (...) { throw; } }
-static void f_465() { try { f_464(); } catch (...) { throw; } }
-static void f_466() { try { f_465(); } catch (...) { throw; } }
-static void f_467() { try { f_466(); } catch (...) { throw; } }
-static void f_468() { try { f_467(); } catch (...) { throw; } }
-static void f_469() { try { f_468(); } catch (...) { throw; } }
-static void f_470() { try { f_469(); } catch (...) { throw; } }
-static void f_471() { try { f_470(); } catch (...) { throw; } }
-static void f_472() { try { f_471(); } catch (...) { throw; } }
-static void f_473() { try { f_472(); } catch (...) { throw; } }
-static void f_474() { try { f_473(); } catch (...) { throw; } }
-static void f_475() { try { f_474(); } catch (...) { throw; } }
-static void f_476() { try { f_475(); } catch (...) { throw; } }
-static void f_477() { try { f_476(); } catch (...) { throw; } }
-static void f_478() { try { f_477(); } catch (...) { throw; } }
-static void f_479() { try { f_478(); } catch (...) { throw; } }
-static void f_480() { try { f_479(); } catch (...) { throw; } }
-static void f_481() { try { f_480(); } catch (...) { throw; } }
-static void f_482() { try { f_481(); } catch (...) { throw; } }
-static void f_483() { try { f_482(); } catch (...) { throw; } }
-static void f_484() { try { f_483(); } catch (...) { throw; } }
-static void f_485() { try { f_484(); } catch (...) { throw; } }
-static void f_486() { try { f_485(); } catch (...) { throw; } }
-static void f_487() { try { f_486(); } catch (...) { throw; } }
-static void f_488() { try { f_487(); } catch (...) { throw; } }
-static void f_489() { try { f_488(); } catch (...) { throw; } }
-static void f_490() { try { f_489(); } catch (...) { throw; } }
-static void f_491() { try { f_490(); } catch (...) { throw; } }
-static void f_492() { try { f_491(); } catch (...) { throw; } }
-static void f_493() { try { f_492(); } catch (...) { throw; } }
-static void f_494() { try { f_493(); } catch (...) { throw; } }
-static void f_495() { try { f_494(); } catch (...) { throw; } }
-static void f_496() { try { f_495(); } catch (...) { throw; } }
-static void f_497() { try { f_496(); } catch (...) { throw; } }
-static void f_498() { try { f_497(); } catch (...) { throw; } }
-static void f_499() { try { f_498(); } catch (...) { throw; } }
-static void f_500() { try { f_499(); } catch (...) { throw; } }
-static void f_501() { try { f_500(); } catch (...) { throw; } }
-static void f_502() { try { f_501(); } catch (...) { throw; } }
-static void f_503() { try { f_502(); } catch (...) { throw; } }
-static void f_504() { try { f_503(); } catch (...) { throw; } }
-static void f_505() { try { f_504(); } catch (...) { throw; } }
-static void f_506() { try { f_505(); } catch (...) { throw; } }
-static void f_507() { try { f_506(); } catch (...) { throw; } }
-static void f_508() { try { f_507(); } catch (...) { throw; } }
-static void f_509() { try { f_508(); } catch (...) { throw; } }
-static void f_510() { try { f_509(); } catch (...) { throw; } }
-static void f_511() { try { f_510(); } catch (...) { throw; } }
-static void f_512() { try { f_511(); } catch (...) { throw; } }
-static void f_513() { try { f_512(); } catch (...) { throw; } }
-static void f_514() { try { f_513(); } catch (...) { throw; } }
-static void f_515() { try { f_514(); } catch (...) { throw; } }
-static void f_516() { try { f_515(); } catch (...) { throw; } }
-static void f_517() { try { f_516(); } catch (...) { throw; } }
-static void f_518() { try { f_517(); } catch (...) { throw; } }
-static void f_519() { try { f_518(); } catch (...) { throw; } }
-static void f_520() { try { f_519(); } catch (...) { throw; } }
-static void f_521() { try { f_520(); } catch (...) { throw; } }
-static void f_522() { try { f_521(); } catch (...) { throw; } }
-static void f_523() { try { f_522(); } catch (...) { throw; } }
-static void f_524() { try { f_523(); } catch (...) { throw; } }
-static void f_525() { try { f_524(); } catch (...) { throw; } }
-static void f_526() { try { f_525(); } catch (...) { throw; } }
-static void f_527() { try { f_526(); } catch (...) { throw; } }
-static void f_528() { try { f_527(); } catch (...) { throw; } }
-static void f_529() { try { f_528(); } catch (...) { throw; } }
-static void f_530() { try { f_529(); } catch (...) { throw; } }
-static void f_531() { try { f_530(); } catch (...) { throw; } }
-static void f_532() { try { f_531(); } catch (...) { throw; } }
-static void f_533() { try { f_532(); } catch (...) { throw; } }
-static void f_534() { try { f_533(); } catch (...) { throw; } }
-static void f_535() { try { f_534(); } catch (...) { throw; } }
-static void f_536() { try { f_535(); } catch (...) { throw; } }
-static void f_537() { try { f_536(); } catch (...) { throw; } }
-static void f_538() { try { f_537(); } catch (...) { throw; } }
-static void f_539() { try { f_538(); } catch (...) { throw; } }
-static void f_540() { try { f_539(); } catch (...) { throw; } }
-static void f_541() { try { f_540(); } catch (...) { throw; } }
-static void f_542() { try { f_541(); } catch (...) { throw; } }
-static void f_543() { try { f_542(); } catch (...) { throw; } }
-static void f_544() { try { f_543(); } catch (...) { throw; } }
-static void f_545() { try { f_544(); } catch (...) { throw; } }
-static void f_546() { try { f_545(); } catch (...) { throw; } }
-static void f_547() { try { f_546(); } catch (...) { throw; } }
-static void f_548() { try { f_547(); } catch (...) { throw; } }
-static void f_549() { try { f_548(); } catch (...) { throw; } }
-static void f_550() { try { f_549(); } catch (...) { throw; } }
-static void f_551() { try { f_550(); } catch (...) { throw; } }
-static void f_552() { try { f_551(); } catch (...) { throw; } }
-static void f_553() { try { f_552(); } catch (...) { throw; } }
-static void f_554() { try { f_553(); } catch (...) { throw; } }
-static void f_555() { try { f_554(); } catch (...) { throw; } }
-static void f_556() { try { f_555(); } catch (...) { throw; } }
-static void f_557() { try { f_556(); } catch (...) { throw; } }
-static void f_558() { try { f_557(); } catch (...) { throw; } }
-static void f_559() { try { f_558(); } catch (...) { throw; } }
-static void f_560() { try { f_559(); } catch (...) { throw; } }
-static void f_561() { try { f_560(); } catch (...) { throw; } }
-static void f_562() { try { f_561(); } catch (...) { throw; } }
-static void f_563() { try { f_562(); } catch (...) { throw; } }
-static void f_564() { try { f_563(); } catch (...) { throw; } }
-static void f_565() { try { f_564(); } catch (...) { throw; } }
-static void f_566() { try { f_565(); } catch (...) { throw; } }
-static void f_567() { try { f_566(); } catch (...) { throw; } }
-static void f_568() { try { f_567(); } catch (...) { throw; } }
-static void f_569() { try { f_568(); } catch (...) { throw; } }
-static void f_570() { try { f_569(); } catch (...) { throw; } }
-static void f_571() { try { f_570(); } catch (...) { throw; } }
-static void f_572() { try { f_571(); } catch (...) { throw; } }
-static void f_573() { try { f_572(); } catch (...) { throw; } }
-static void f_574() { try { f_573(); } catch (...) { throw; } }
-static void f_575() { try { f_574(); } catch (...) { throw; } }
-static void f_576() { try { f_575(); } catch (...) { throw; } }
-static void f_577() { try { f_576(); } catch (...) { throw; } }
-static void f_578() { try { f_577(); } catch (...) { throw; } }
-static void f_579() { try { f_578(); } catch (...) { throw; } }
-static void f_580() { try { f_579(); } catch (...) { throw; } }
-static void f_581() { try { f_580(); } catch (...) { throw; } }
-static void f_582() { try { f_581(); } catch (...) { throw; } }
-static void f_583() { try { f_582(); } catch (...) { throw; } }
-static void f_584() { try { f_583(); } catch (...) { throw; } }
-static void f_585() { try { f_584(); } catch (...) { throw; } }
-static void f_586() { try { f_585(); } catch (...) { throw; } }
-static void f_587() { try { f_586(); } catch (...) { throw; } }
-static void f_588() { try { f_587(); } catch (...) { throw; } }
-static void f_589() { try { f_588(); } catch (...) { throw; } }
-static void f_590() { try { f_589(); } catch (...) { throw; } }
-static void f_591() { try { f_590(); } catch (...) { throw; } }
-static void f_592() { try { f_591(); } catch (...) { throw; } }
-static void f_593() { try { f_592(); } catch (...) { throw; } }
-static void f_594() { try { f_593(); } catch (...) { throw; } }
-static void f_595() { try { f_594(); } catch (...) { throw; } }
-static void f_596() { try { f_595(); } catch (...) { throw; } }
-static void f_597() { try { f_596(); } catch (...) { throw; } }
-static void f_598() { try { f_597(); } catch (...) { throw; } }
-static void f_599() { try { f_598(); } catch (...) { throw; } }
-static void f_600() { try { f_599(); } catch (...) { throw; } }
-static void f_601() { try { f_600(); } catch (...) { throw; } }
-static void f_602() { try { f_601(); } catch (...) { throw; } }
-static void f_603() { try { f_602(); } catch (...) { throw; } }
-static void f_604() { try { f_603(); } catch (...) { throw; } }
-static void f_605() { try { f_604(); } catch (...) { throw; } }
-static void f_606() { try { f_605(); } catch (...) { throw; } }
-static void f_607() { try { f_606(); } catch (...) { throw; } }
-static void f_608() { try { f_607(); } catch (...) { throw; } }
-static void f_609() { try { f_608(); } catch (...) { throw; } }
-static void f_610() { try { f_609(); } catch (...) { throw; } }
-static void f_611() { try { f_610(); } catch (...) { throw; } }
-static void f_612() { try { f_611(); } catch (...) { throw; } }
-static void f_613() { try { f_612(); } catch (...) { throw; } }
-static void f_614() { try { f_613(); } catch (...) { throw; } }
-static void f_615() { try { f_614(); } catch (...) { throw; } }
-static void f_616() { try { f_615(); } catch (...) { throw; } }
-static void f_617() { try { f_616(); } catch (...) { throw; } }
-static void f_618() { try { f_617(); } catch (...) { throw; } }
-static void f_619() { try { f_618(); } catch (...) { throw; } }
-static void f_620() { try { f_619(); } catch (...) { throw; } }
-static void f_621() { try { f_620(); } catch (...) { throw; } }
-static void f_622() { try { f_621(); } catch (...) { throw; } }
-static void f_623() { try { f_622(); } catch (...) { throw; } }
-static void f_624() { try { f_623(); } catch (...) { throw; } }
-static void f_625() { try { f_624(); } catch (...) { throw; } }
-static void f_626() { try { f_625(); } catch (...) { throw; } }
-static void f_627() { try { f_626(); } catch (...) { throw; } }
-static void f_628() { try { f_627(); } catch (...) { throw; } }
-static void f_629() { try { f_628(); } catch (...) { throw; } }
-static void f_630() { try { f_629(); } catch (...) { throw; } }
-static void f_631() { try { f_630(); } catch (...) { throw; } }
-static void f_632() { try { f_631(); } catch (...) { throw; } }
-static void f_633() { try { f_632(); } catch (...) { throw; } }
-static void f_634() { try { f_633(); } catch (...) { throw; } }
-static void f_635() { try { f_634(); } catch (...) { throw; } }
-static void f_636() { try { f_635(); } catch (...) { throw; } }
-static void f_637() { try { f_636(); } catch (...) { throw; } }
-static void f_638() { try { f_637(); } catch (...) { throw; } }
-static void f_639() { try { f_638(); } catch (...) { throw; } }
-static void f_640() { try { f_639(); } catch (...) { throw; } }
-static void f_641() { try { f_640(); } catch (...) { throw; } }
-static void f_642() { try { f_641(); } catch (...) { throw; } }
-static void f_643() { try { f_642(); } catch (...) { throw; } }
-static void f_644() { try { f_643(); } catch (...) { throw; } }
-static void f_645() { try { f_644(); } catch (...) { throw; } }
-static void f_646() { try { f_645(); } catch (...) { throw; } }
-static void f_647() { try { f_646(); } catch (...) { throw; } }
-static void f_648() { try { f_647(); } catch (...) { throw; } }
-static void f_649() { try { f_648(); } catch (...) { throw; } }
-static void f_650() { try { f_649(); } catch (...) { throw; } }
-static void f_651() { try { f_650(); } catch (...) { throw; } }
-static void f_652() { try { f_651(); } catch (...) { throw; } }
-static void f_653() { try { f_652(); } catch (...) { throw; } }
-static void f_654() { try { f_653(); } catch (...) { throw; } }
-static void f_655() { try { f_654(); } catch (...) { throw; } }
-static void f_656() { try { f_655(); } catch (...) { throw; } }
-static void f_657() { try { f_656(); } catch (...) { throw; } }
-static void f_658() { try { f_657(); } catch (...) { throw; } }
-static void f_659() { try { f_658(); } catch (...) { throw; } }
-static void f_660() { try { f_659(); } catch (...) { throw; } }
-static void f_661() { try { f_660(); } catch (...) { throw; } }
-static void f_662() { try { f_661(); } catch (...) { throw; } }
-static void f_663() { try { f_662(); } catch (...) { throw; } }
-static void f_664() { try { f_663(); } catch (...) { throw; } }
-static void f_665() { try { f_664(); } catch (...) { throw; } }
-static void f_666() { try { f_665(); } catch (...) { throw; } }
-static void f_667() { try { f_666(); } catch (...) { throw; } }
-static void f_668() { try { f_667(); } catch (...) { throw; } }
-static void f_669() { try { f_668(); } catch (...) { throw; } }
-static void f_670() { try { f_669(); } catch (...) { throw; } }
-static void f_671() { try { f_670(); } catch (...) { throw; } }
-static void f_672() { try { f_671(); } catch (...) { throw; } }
-static void f_673() { try { f_672(); } catch (...) { throw; } }
-static void f_674() { try { f_673(); } catch (...) { throw; } }
-static void f_675() { try { f_674(); } catch (...) { throw; } }
-static void f_676() { try { f_675(); } catch (...) { throw; } }
-static void f_677() { try { f_676(); } catch (...) { throw; } }
-static void f_678() { try { f_677(); } catch (...) { throw; } }
-static void f_679() { try { f_678(); } catch (...) { throw; } }
-static void f_680() { try { f_679(); } catch (...) { throw; } }
-static void f_681() { try { f_680(); } catch (...) { throw; } }
-static void f_682() { try { f_681(); } catch (...) { throw; } }
-static void f_683() { try { f_682(); } catch (...) { throw; } }
-static void f_684() { try { f_683(); } catch (...) { throw; } }
-static void f_685() { try { f_684(); } catch (...) { throw; } }
-static void f_686() { try { f_685(); } catch (...) { throw; } }
-static void f_687() { try { f_686(); } catch (...) { throw; } }
-static void f_688() { try { f_687(); } catch (...) { throw; } }
-static void f_689() { try { f_688(); } catch (...) { throw; } }
-static void f_690() { try { f_689(); } catch (...) { throw; } }
-static void f_691() { try { f_690(); } catch (...) { throw; } }
-static void f_692() { try { f_691(); } catch (...) { throw; } }
-static void f_693() { try { f_692(); } catch (...) { throw; } }
-static void f_694() { try { f_693(); } catch (...) { throw; } }
-static void f_695() { try { f_694(); } catch (...) { throw; } }
-static void f_696() { try { f_695(); } catch (...) { throw; } }
-static void f_697() { try { f_696(); } catch (...) { throw; } }
-static void f_698() { try { f_697(); } catch (...) { throw; } }
-static void f_699() { try { f_698(); } catch (...) { throw; } }
-static void f_700() { try { f_699(); } catch (...) { throw; } }
-static void f_701() { try { f_700(); } catch (...) { throw; } }
-static void f_702() { try { f_701(); } catch (...) { throw; } }
-static void f_703() { try { f_702(); } catch (...) { throw; } }
-static void f_704() { try { f_703(); } catch (...) { throw; } }
-static void f_705() { try { f_704(); } catch (...) { throw; } }
-static void f_706() { try { f_705(); } catch (...) { throw; } }
-static void f_707() { try { f_706(); } catch (...) { throw; } }
-static void f_708() { try { f_707(); } catch (...) { throw; } }
-static void f_709() { try { f_708(); } catch (...) { throw; } }
-static void f_710() { try { f_709(); } catch (...) { throw; } }
-static void f_711() { try { f_710(); } catch (...) { throw; } }
-static void f_712() { try { f_711(); } catch (...) { throw; } }
-static void f_713() { try { f_712(); } catch (...) { throw; } }
-static void f_714() { try { f_713(); } catch (...) { throw; } }
-static void f_715() { try { f_714(); } catch (...) { throw; } }
-static void f_716() { try { f_715(); } catch (...) { throw; } }
-static void f_717() { try { f_716(); } catch (...) { throw; } }
-static void f_718() { try { f_717(); } catch (...) { throw; } }
-static void f_719() { try { f_718(); } catch (...) { throw; } }
-static void f_720() { try { f_719(); } catch (...) { throw; } }
-static void f_721() { try { f_720(); } catch (...) { throw; } }
-static void f_722() { try { f_721(); } catch (...) { throw; } }
-static void f_723() { try { f_722(); } catch (...) { throw; } }
-static void f_724() { try { f_723(); } catch (...) { throw; } }
-static void f_725() { try { f_724(); } catch (...) { throw; } }
-static void f_726() { try { f_725(); } catch (...) { throw; } }
-static void f_727() { try { f_726(); } catch (...) { throw; } }
-static void f_728() { try { f_727(); } catch (...) { throw; } }
-static void f_729() { try { f_728(); } catch (...) { throw; } }
-static void f_730() { try { f_729(); } catch (...) { throw; } }
-static void f_731() { try { f_730(); } catch (...) { throw; } }
-static void f_732() { try { f_731(); } catch (...) { throw; } }
-static void f_733() { try { f_732(); } catch (...) { throw; } }
-static void f_734() { try { f_733(); } catch (...) { throw; } }
-static void f_735() { try { f_734(); } catch (...) { throw; } }
-static void f_736() { try { f_735(); } catch (...) { throw; } }
-static void f_737() { try { f_736(); } catch (...) { throw; } }
-static void f_738() { try { f_737(); } catch (...) { throw; } }
-static void f_739() { try { f_738(); } catch (...) { throw; } }
-static void f_740() { try { f_739(); } catch (...) { throw; } }
-static void f_741() { try { f_740(); } catch (...) { throw; } }
-static void f_742() { try { f_741(); } catch (...) { throw; } }
-static void f_743() { try { f_742(); } catch (...) { throw; } }
-static void f_744() { try { f_743(); } catch (...) { throw; } }
-static void f_745() { try { f_744(); } catch (...) { throw; } }
-static void f_746() { try { f_745(); } catch (...) { throw; } }
-static void f_747() { try { f_746(); } catch (...) { throw; } }
-static void f_748() { try { f_747(); } catch (...) { throw; } }
-static void f_749() { try { f_748(); } catch (...) { throw; } }
-static void f_750() { try { f_749(); } catch (...) { throw; } }
-static void f_751() { try { f_750(); } catch (...) { throw; } }
-static void f_752() { try { f_751(); } catch (...) { throw; } }
-static void f_753() { try { f_752(); } catch (...) { throw; } }
-static void f_754() { try { f_753(); } catch (...) { throw; } }
-static void f_755() { try { f_754(); } catch (...) { throw; } }
-static void f_756() { try { f_755(); } catch (...) { throw; } }
-static void f_757() { try { f_756(); } catch (...) { throw; } }
-static void f_758() { try { f_757(); } catch (...) { throw; } }
-static void f_759() { try { f_758(); } catch (...) { throw; } }
-static void f_760() { try { f_759(); } catch (...) { throw; } }
-static void f_761() { try { f_760(); } catch (...) { throw; } }
-static void f_762() { try { f_761(); } catch (...) { throw; } }
-static void f_763() { try { f_762(); } catch (...) { throw; } }
-static void f_764() { try { f_763(); } catch (...) { throw; } }
-static void f_765() { try { f_764(); } catch (...) { throw; } }
-static void f_766() { try { f_765(); } catch (...) { throw; } }
-static void f_767() { try { f_766(); } catch (...) { throw; } }
-static void f_768() { try { f_767(); } catch (...) { throw; } }
-static void f_769() { try { f_768(); } catch (...) { throw; } }
-static void f_770() { try { f_769(); } catch (...) { throw; } }
-static void f_771() { try { f_770(); } catch (...) { throw; } }
-static void f_772() { try { f_771(); } catch (...) { throw; } }
-static void f_773() { try { f_772(); } catch (...) { throw; } }
-static void f_774() { try { f_773(); } catch (...) { throw; } }
-static void f_775() { try { f_774(); } catch (...) { throw; } }
-static void f_776() { try { f_775(); } catch (...) { throw; } }
-static void f_777() { try { f_776(); } catch (...) { throw; } }
-static void f_778() { try { f_777(); } catch (...) { throw; } }
-static void f_779() { try { f_778(); } catch (...) { throw; } }
-static void f_780() { try { f_779(); } catch (...) { throw; } }
-static void f_781() { try { f_780(); } catch (...) { throw; } }
-static void f_782() { try { f_781(); } catch (...) { throw; } }
-static void f_783() { try { f_782(); } catch (...) { throw; } }
-static void f_784() { try { f_783(); } catch (...) { throw; } }
-static void f_785() { try { f_784(); } catch (...) { throw; } }
-static void f_786() { try { f_785(); } catch (...) { throw; } }
-static void f_787() { try { f_786(); } catch (...) { throw; } }
-static void f_788() { try { f_787(); } catch (...) { throw; } }
-static void f_789() { try { f_788(); } catch (...) { throw; } }
-static void f_790() { try { f_789(); } catch (...) { throw; } }
-static void f_791() { try { f_790(); } catch (...) { throw; } }
-static void f_792() { try { f_791(); } catch (...) { throw; } }
-static void f_793() { try { f_792(); } catch (...) { throw; } }
-static void f_794() { try { f_793(); } catch (...) { throw; } }
-static void f_795() { try { f_794(); } catch (...) { throw; } }
-static void f_796() { try { f_795(); } catch (...) { throw; } }
-static void f_797() { try { f_796(); } catch (...) { throw; } }
-static void f_798() { try { f_797(); } catch (...) { throw; } }
-static void f_799() { try { f_798(); } catch (...) { throw; } }
-static void f_800() { try { f_799(); } catch (...) { throw; } }
-static void f_801() { try { f_800(); } catch (...) { throw; } }
-static void f_802() { try { f_801(); } catch (...) { throw; } }
-static void f_803() { try { f_802(); } catch (...) { throw; } }
-static void f_804() { try { f_803(); } catch (...) { throw; } }
-static void f_805() { try { f_804(); } catch (...) { throw; } }
-static void f_806() { try { f_805(); } catch (...) { throw; } }
-static void f_807() { try { f_806(); } catch (...) { throw; } }
-static void f_808() { try { f_807(); } catch (...) { throw; } }
-static void f_809() { try { f_808(); } catch (...) { throw; } }
-static void f_810() { try { f_809(); } catch (...) { throw; } }
-static void f_811() { try { f_810(); } catch (...) { throw; } }
-static void f_812() { try { f_811(); } catch (...) { throw; } }
-static void f_813() { try { f_812(); } catch (...) { throw; } }
-static void f_814() { try { f_813(); } catch (...) { throw; } }
-static void f_815() { try { f_814(); } catch (...) { throw; } }
-static void f_816() { try { f_815(); } catch (...) { throw; } }
-static void f_817() { try { f_816(); } catch (...) { throw; } }
-static void f_818() { try { f_817(); } catch (...) { throw; } }
-static void f_819() { try { f_818(); } catch (...) { throw; } }
-static void f_820() { try { f_819(); } catch (...) { throw; } }
-static void f_821() { try { f_820(); } catch (...) { throw; } }
-static void f_822() { try { f_821(); } catch (...) { throw; } }
-static void f_823() { try { f_822(); } catch (...) { throw; } }
-static void f_824() { try { f_823(); } catch (...) { throw; } }
-static void f_825() { try { f_824(); } catch (...) { throw; } }
-static void f_826() { try { f_825(); } catch (...) { throw; } }
-static void f_827() { try { f_826(); } catch (...) { throw; } }
-static void f_828() { try { f_827(); } catch (...) { throw; } }
-static void f_829() { try { f_828(); } catch (...) { throw; } }
-static void f_830() { try { f_829(); } catch (...) { throw; } }
-static void f_831() { try { f_830(); } catch (...) { throw; } }
-static void f_832() { try { f_831(); } catch (...) { throw; } }
-static void f_833() { try { f_832(); } catch (...) { throw; } }
-static void f_834() { try { f_833(); } catch (...) { throw; } }
-static void f_835() { try { f_834(); } catch (...) { throw; } }
-static void f_836() { try { f_835(); } catch (...) { throw; } }
-static void f_837() { try { f_836(); } catch (...) { throw; } }
-static void f_838() { try { f_837(); } catch (...) { throw; } }
-static void f_839() { try { f_838(); } catch (...) { throw; } }
-static void f_840() { try { f_839(); } catch (...) { throw; } }
-static void f_841() { try { f_840(); } catch (...) { throw; } }
-static void f_842() { try { f_841(); } catch (...) { throw; } }
-static void f_843() { try { f_842(); } catch (...) { throw; } }
-static void f_844() { try { f_843(); } catch (...) { throw; } }
-static void f_845() { try { f_844(); } catch (...) { throw; } }
-static void f_846() { try { f_845(); } catch (...) { throw; } }
-static void f_847() { try { f_846(); } catch (...) { throw; } }
-static void f_848() { try { f_847(); } catch (...) { throw; } }
-static void f_849() { try { f_848(); } catch (...) { throw; } }
-static void f_850() { try { f_849(); } catch (...) { throw; } }
-static void f_851() { try { f_850(); } catch (...) { throw; } }
-static void f_852() { try { f_851(); } catch (...) { throw; } }
-static void f_853() { try { f_852(); } catch (...) { throw; } }
-static void f_854() { try { f_853(); } catch (...) { throw; } }
-static void f_855() { try { f_854(); } catch (...) { throw; } }
-static void f_856() { try { f_855(); } catch (...) { throw; } }
-static void f_857() { try { f_856(); } catch (...) { throw; } }
-static void f_858() { try { f_857(); } catch (...) { throw; } }
-static void f_859() { try { f_858(); } catch (...) { throw; } }
-static void f_860() { try { f_859(); } catch (...) { throw; } }
-static void f_861() { try { f_860(); } catch (...) { throw; } }
-static void f_862() { try { f_861(); } catch (...) { throw; } }
-static void f_863() { try { f_862(); } catch (...) { throw; } }
-static void f_864() { try { f_863(); } catch (...) { throw; } }
-static void f_865() { try { f_864(); } catch (...) { throw; } }
-static void f_866() { try { f_865(); } catch (...) { throw; } }
-static void f_867() { try { f_866(); } catch (...) { throw; } }
-static void f_868() { try { f_867(); } catch (...) { throw; } }
-static void f_869() { try { f_868(); } catch (...) { throw; } }
-static void f_870() { try { f_869(); } catch (...) { throw; } }
-static void f_871() { try { f_870(); } catch (...) { throw; } }
-static void f_872() { try { f_871(); } catch (...) { throw; } }
-static void f_873() { try { f_872(); } catch (...) { throw; } }
-static void f_874() { try { f_873(); } catch (...) { throw; } }
-static void f_875() { try { f_874(); } catch (...) { throw; } }
-static void f_876() { try { f_875(); } catch (...) { throw; } }
-static void f_877() { try { f_876(); } catch (...) { throw; } }
-static void f_878() { try { f_877(); } catch (...) { throw; } }
-static void f_879() { try { f_878(); } catch (...) { throw; } }
-static void f_880() { try { f_879(); } catch (...) { throw; } }
-static void f_881() { try { f_880(); } catch (...) { throw; } }
-static void f_882() { try { f_881(); } catch (...) { throw; } }
-static void f_883() { try { f_882(); } catch (...) { throw; } }
-static void f_884() { try { f_883(); } catch (...) { throw; } }
-static void f_885() { try { f_884(); } catch (...) { throw; } }
-static void f_886() { try { f_885(); } catch (...) { throw; } }
-static void f_887() { try { f_886(); } catch (...) { throw; } }
-static void f_888() { try { f_887(); } catch (...) { throw; } }
-static void f_889() { try { f_888(); } catch (...) { throw; } }
-static void f_890() { try { f_889(); } catch (...) { throw; } }
-static void f_891() { try { f_890(); } catch (...) { throw; } }
-static void f_892() { try { f_891(); } catch (...) { throw; } }
-static void f_893() { try { f_892(); } catch (...) { throw; } }
-static void f_894() { try { f_893(); } catch (...) { throw; } }
-static void f_895() { try { f_894(); } catch (...) { throw; } }
-static void f_896() { try { f_895(); } catch (...) { throw; } }
-static void f_897() { try { f_896(); } catch (...) { throw; } }
-static void f_898() { try { f_897(); } catch (...) { throw; } }
-static void f_899() { try { f_898(); } catch (...) { throw; } }
-static void f_900() { try { f_899(); } catch (...) { throw; } }
-static void f_901() { try { f_900(); } catch (...) { throw; } }
-static void f_902() { try { f_901(); } catch (...) { throw; } }
-static void f_903() { try { f_902(); } catch (...) { throw; } }
-static void f_904() { try { f_903(); } catch (...) { throw; } }
-static void f_905() { try { f_904(); } catch (...) { throw; } }
-static void f_906() { try { f_905(); } catch (...) { throw; } }
-static void f_907() { try { f_906(); } catch (...) { throw; } }
-static void f_908() { try { f_907(); } catch (...) { throw; } }
-static void f_909() { try { f_908(); } catch (...) { throw; } }
-static void f_910() { try { f_909(); } catch (...) { throw; } }
-static void f_911() { try { f_910(); } catch (...) { throw; } }
-static void f_912() { try { f_911(); } catch (...) { throw; } }
-static void f_913() { try { f_912(); } catch (...) { throw; } }
-static void f_914() { try { f_913(); } catch (...) { throw; } }
-static void f_915() { try { f_914(); } catch (...) { throw; } }
-static void f_916() { try { f_915(); } catch (...) { throw; } }
-static void f_917() { try { f_916(); } catch (...) { throw; } }
-static void f_918() { try { f_917(); } catch (...) { throw; } }
-static void f_919() { try { f_918(); } catch (...) { throw; } }
-static void f_920() { try { f_919(); } catch (...) { throw; } }
-static void f_921() { try { f_920(); } catch (...) { throw; } }
-static void f_922() { try { f_921(); } catch (...) { throw; } }
-static void f_923() { try { f_922(); } catch (...) { throw; } }
-static void f_924() { try { f_923(); } catch (...) { throw; } }
-static void f_925() { try { f_924(); } catch (...) { throw; } }
-static void f_926() { try { f_925(); } catch (...) { throw; } }
-static void f_927() { try { f_926(); } catch (...) { throw; } }
-static void f_928() { try { f_927(); } catch (...) { throw; } }
-static void f_929() { try { f_928(); } catch (...) { throw; } }
-static void f_930() { try { f_929(); } catch (...) { throw; } }
-static void f_931() { try { f_930(); } catch (...) { throw; } }
-static void f_932() { try { f_931(); } catch (...) { throw; } }
-static void f_933() { try { f_932(); } catch (...) { throw; } }
-static void f_934() { try { f_933(); } catch (...) { throw; } }
-static void f_935() { try { f_934(); } catch (...) { throw; } }
-static void f_936() { try { f_935(); } catch (...) { throw; } }
-static void f_937() { try { f_936(); } catch (...) { throw; } }
-static void f_938() { try { f_937(); } catch (...) { throw; } }
-static void f_939() { try { f_938(); } catch (...) { throw; } }
-static void f_940() { try { f_939(); } catch (...) { throw; } }
-static void f_941() { try { f_940(); } catch (...) { throw; } }
-static void f_942() { try { f_941(); } catch (...) { throw; } }
-static void f_943() { try { f_942(); } catch (...) { throw; } }
-static void f_944() { try { f_943(); } catch (...) { throw; } }
-static void f_945() { try { f_944(); } catch (...) { throw; } }
-static void f_946() { try { f_945(); } catch (...) { throw; } }
-static void f_947() { try { f_946(); } catch (...) { throw; } }
-static void f_948() { try { f_947(); } catch (...) { throw; } }
-static void f_949() { try { f_948(); } catch (...) { throw; } }
-static void f_950() { try { f_949(); } catch (...) { throw; } }
-static void f_951() { try { f_950(); } catch (...) { throw; } }
-static void f_952() { try { f_951(); } catch (...) { throw; } }
-static void f_953() { try { f_952(); } catch (...) { throw; } }
-static void f_954() { try { f_953(); } catch (...) { throw; } }
-static void f_955() { try { f_954(); } catch (...) { throw; } }
-static void f_956() { try { f_955(); } catch (...) { throw; } }
-static void f_957() { try { f_956(); } catch (...) { throw; } }
-static void f_958() { try { f_957(); } catch (...) { throw; } }
-static void f_959() { try { f_958(); } catch (...) { throw; } }
-static void f_960() { try { f_959(); } catch (...) { throw; } }
-static void f_961() { try { f_960(); } catch (...) { throw; } }
-static void f_962() { try { f_961(); } catch (...) { throw; } }
-static void f_963() { try { f_962(); } catch (...) { throw; } }
-static void f_964() { try { f_963(); } catch (...) { throw; } }
-static void f_965() { try { f_964(); } catch (...) { throw; } }
-static void f_966() { try { f_965(); } catch (...) { throw; } }
-static void f_967() { try { f_966(); } catch (...) { throw; } }
-static void f_968() { try { f_967(); } catch (...) { throw; } }
-static void f_969() { try { f_968(); } catch (...) { throw; } }
-static void f_970() { try { f_969(); } catch (...) { throw; } }
-static void f_971() { try { f_970(); } catch (...) { throw; } }
-static void f_972() { try { f_971(); } catch (...) { throw; } }
-static void f_973() { try { f_972(); } catch (...) { throw; } }
-static void f_974() { try { f_973(); } catch (...) { throw; } }
-static void f_975() { try { f_974(); } catch (...) { throw; } }
-static void f_976() { try { f_975(); } catch (...) { throw; } }
-static void f_977() { try { f_976(); } catch (...) { throw; } }
-static void f_978() { try { f_977(); } catch (...) { throw; } }
-static void f_979() { try { f_978(); } catch (...) { throw; } }
-static void f_980() { try { f_979(); } catch (...) { throw; } }
-static void f_981() { try { f_980(); } catch (...) { throw; } }
-static void f_982() { try { f_981(); } catch (...) { throw; } }
-static void f_983() { try { f_982(); } catch (...) { throw; } }
-static void f_984() { try { f_983(); } catch (...) { throw; } }
-static void f_985() { try { f_984(); } catch (...) { throw; } }
-static void f_986() { try { f_985(); } catch (...) { throw; } }
-static void f_987() { try { f_986(); } catch (...) { throw; } }
-static void f_988() { try { f_987(); } catch (...) { throw; } }
-static void f_989() { try { f_988(); } catch (...) { throw; } }
-static void f_990() { try { f_989(); } catch (...) { throw; } }
-static void f_991() { try { f_990(); } catch (...) { throw; } }
-static void f_992() { try { f_991(); } catch (...) { throw; } }
-static void f_993() { try { f_992(); } catch (...) { throw; } }
-static void f_994() { try { f_993(); } catch (...) { throw; } }
-static void f_995() { try { f_994(); } catch (...) { throw; } }
-static void f_996() { try { f_995(); } catch (...) { throw; } }
-static void f_997() { try { f_996(); } catch (...) { throw; } }
-static void f_998() { try { f_997(); } catch (...) { throw; } }
-static void f_999() { try { f_998(); } catch (...) { throw; } }
-static void f_1000() { try { f_999(); } catch (...) { throw; } }
-static void f_1001() { try { f_1000(); } catch (...) { throw; } }
-static void f_1002() { try { f_1001(); } catch (...) { throw; } }
-static void f_1003() { try { f_1002(); } catch (...) { throw; } }
-static void f_1004() { try { f_1003(); } catch (...) { throw; } }
-static void f_1005() { try { f_1004(); } catch (...) { throw; } }
-static void f_1006() { try { f_1005(); } catch (...) { throw; } }
-static void f_1007() { try { f_1006(); } catch (...) { throw; } }
-static void f_1008() { try { f_1007(); } catch (...) { throw; } }
-static void f_1009() { try { f_1008(); } catch (...) { throw; } }
-static void f_1010() { try { f_1009(); } catch (...) { throw; } }
-static void f_1011() { try { f_1010(); } catch (...) { throw; } }
-static void f_1012() { try { f_1011(); } catch (...) { throw; } }
-static void f_1013() { try { f_1012(); } catch (...) { throw; } }
-static void f_1014() { try { f_1013(); } catch (...) { throw; } }
-static void f_1015() { try { f_1014(); } catch (...) { throw; } }
-static void f_1016() { try { f_1015(); } catch (...) { throw; } }
-static void f_1017() { try { f_1016(); } catch (...) { throw; } }
-static void f_1018() { try { f_1017(); } catch (...) { throw; } }
-static void f_1019() { try { f_1018(); } catch (...) { throw; } }
-static void f_1020() { try { f_1019(); } catch (...) { throw; } }
+template <int N>
+void f() { try { f<N - 1>(); } catch (...) { throw; } }
+
+template <>
+void f<0>() { throw 42; }
+
 int main(int argc, char *argv[]) {
   try {
-    f_1020();
+    f<1020>();
   } catch (int n) {
     return 42 - n;
   }

From 1ba879965ff84ed4ef13b98b4144e23681a04d79 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Fri, 7 Feb 2025 14:28:01 +1100
Subject: [PATCH 077/282] [ORC] Add a FIXME. NFC.

(cherry picked from commit 7811c20bcd9d5b117a9543d74c71448fe9970fe6)
---
 llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index b51fa24be76d1..a9dbcd166117b 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -55,6 +55,9 @@ SelfExecutorProcessControl::SelfExecutorProcessControl(
       ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper);
 
 #ifdef __APPLE__
+  // FIXME: Don't add an UnwindInfoManager by default -- it's redundant when
+  //        the ORC runtime is loaded. We'll need a way to document this and
+  //        allow clients to choose.
   this->UnwindInfoMgr = UnwindInfoManager::TryCreate();
   if (this->UnwindInfoMgr)
     this->UnwindInfoMgr->addBootstrapSymbols(this->BootstrapSymbols);

From c7e085c5d32def57fffd292174a052bff6d6a241 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Fri, 7 Feb 2025 16:30:47 +1100
Subject: [PATCH 078/282] [ORC] Add ExecutionSession convenience methods to
 access bootstrap values.

The getBootstrapMap, getBootstrapMapValue, getBootstrapSymbolsMap, and
getBootstrapSymbols methods forward to their respective counterparts in
ExecutorProcessControl, similar to the callWrapper functions.

These methods will be used to simplify an upcoming patch that accesses
the bootstrap values.

(cherry picked from commit 63bb4ba84a22279c1cdd4953f0c19269b23a9d32)
---
 llvm/include/llvm/ExecutionEngine/Orc/Core.h | 24 ++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
index 3eddaf4c9c59f..cecb4094c9a57 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -1556,6 +1556,30 @@ class ExecutionSession {
     EPC->getDispatcher().dispatch(std::move(T));
   }
 
+  /// Returns the bootstrap map.
+  const StringMap<std::vector<char>> &getBootstrapMap() const {
+    return EPC->getBootstrapMap();
+  }
+
+  /// Look up and SPS-deserialize a bootstrap map value.
+  template <typename T, typename SPSTagT>
+  Error getBootstrapMapValue(StringRef Key, std::optional<T> &Val) const {
+    return EPC->getBootstrapMapValue<T, SPSTagT>(Key, Val);
+  }
+
+  /// Returns the bootstrap symbol map.
+  const StringMap<ExecutorAddr> &getBootstrapSymbolsMap() const {
+    return EPC->getBootstrapSymbolsMap();
+  }
+
+  /// For each (ExecutorAddr&, StringRef) pair, looks up the string in the
+  /// bootstrap symbols map and writes its address to the ExecutorAddr if
+  /// found. If any symbol is not found then the function returns an error.
+  Error getBootstrapSymbols(
+      ArrayRef<std::pair<ExecutorAddr &, StringRef>> Pairs) const {
+    return EPC->getBootstrapSymbols(Pairs);
+  }
+
   /// Run a wrapper function in the executor.
   ///
   /// The wrapper function should be callable as:

From bf2828f65823264f46e9cca60f0f4e9988ce2f57 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Fri, 7 Feb 2025 16:51:52 +1100
Subject: [PATCH 079/282] [ORC] Force eh-frame use for older Darwins on x86-64
 in MachOPlatform, LLJIT.

The system libunwind on older Darwins does not support JIT registration of
compact-unwind. Since the CompactUnwindManager utility discards redundant
eh-frame FDEs by default we need to remove the compact-unwind section first
when targeting older libunwinds in order to preserve eh-frames.

While LLJIT was already doing this as of eae6d6d18bd, MachOPlatform was not.
This was causing buildbot failures in the ORC runtime (e.g. in
https://green.lab.llvm.org/job/llvm.org/job/clang-stage1-RA/3479/).

This patch updates both LLJIT and MachOPlatform to check a bootstrap value,
"darwin-use-ehframes-only", to determine whether to forcibly preserve
eh-frame sections. If this value is present and set to true then compact-unwind
sections will be discarded, causing eh-frames to be preserved. If the value is
absent or set to false then compact-unwind will be used and redundant FDEs in
eh-frames discarded (FDEs that are needed by the compact-unwind section are
always preserved).

rdar://143895614
(cherry picked from commit e2eaf8ded78507100513a17e8193e2c4b094f8da)
---
 .../llvm/ExecutionEngine/Orc/MachOPlatform.h  |  1 +
 .../DefaultHostBootstrapValues.h              | 28 +++++++++++++++
 .../Orc/ExecutorProcessControl.cpp            |  8 ++---
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp        | 33 ++++++++++++-----
 .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 15 ++++++++
 .../Orc/TargetProcess/CMakeLists.txt          |  1 +
 .../DefaultHostBootstrapValues.cpp            | 36 +++++++++++++++++++
 .../Orc/TargetProcess/OrcRTBootstrap.cpp      |  4 ---
 .../llvm-jitlink-executor.cpp                 |  3 ++
 9 files changed, 110 insertions(+), 19 deletions(-)
 create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h
 create mode 100644 llvm/lib/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.cpp

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
index 6e99f6c03a7c6..91842714f6c4c 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
@@ -368,6 +368,7 @@ class MachOPlatform : public Platform {
   DenseMap<JITDylib *, SymbolLookupSet> RegisteredInitSymbols;
 
   std::mutex PlatformMutex;
+  bool ForceEHFrames = false;
   BootstrapInfo *Bootstrap = nullptr;
   DenseMap<JITDylib *, ExecutorAddr> JITDylibToHeaderAddr;
   DenseMap<ExecutorAddr, JITDylib *> HeaderAddrToJITDylib;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h
new file mode 100644
index 0000000000000..d3277e61eeb7b
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h
@@ -0,0 +1,28 @@
+//===- DefaultHostBootstrapValues.h - Defaults for host process -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Set sensible default bootstrap values for JIT execution in the host process.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_DEFAULTHOSTBOOTSTRAPVALUES_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_DEFAULTHOSTBOOTSTRAPVALUES_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include <vector>
+
+namespace llvm::orc {
+
+void addDefaultBootstrapValuesForHostProcess(
+    StringMap<std::vector<char>> &BootstrapMap,
+    StringMap<ExecutorAddr> &BootstrapSymbols);
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_DEFAULTHOSTBOOTSTRAPVALUES_H
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index a9dbcd166117b..7b38150ab4b65 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -9,8 +9,7 @@
 #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
 
 #include "llvm/ExecutionEngine/Orc/Core.h"
-#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
-#include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h"
 #include "llvm/Support/Process.h"
 #include "llvm/TargetParser/Host.h"
@@ -49,10 +48,7 @@ SelfExecutorProcessControl::SelfExecutorProcessControl(
   if (this->TargetTriple.isOSBinFormatMachO())
     GlobalManglingPrefix = '_';
 
-  this->BootstrapSymbols[rt::RegisterEHFrameSectionWrapperName] =
-      ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper);
-  this->BootstrapSymbols[rt::DeregisterEHFrameSectionWrapperName] =
-      ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper);
+  addDefaultBootstrapValuesForHostProcess(BootstrapMap, BootstrapSymbols);
 
 #ifdef __APPLE__
   // FIXME: Don't add an UnwindInfoManager by default -- it's redundant when
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 938fe58ef85cf..dd844ae3a42bc 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -1221,22 +1221,37 @@ Expected<JITDylibSP> setUpGenericLLVMIRPlatform(LLJIT &J) {
 
   if (auto *OLL = dyn_cast<ObjectLinkingLayer>(&J.getObjLinkingLayer())) {
 
-    bool CompactUnwindInfoSupported = false;
+    bool UseEHFrames = true;
 
     // Enable compact-unwind support if possible.
     if (J.getTargetTriple().isOSDarwin() ||
         J.getTargetTriple().isOSBinFormatMachO()) {
-      if (auto UIRP = UnwindInfoRegistrationPlugin::Create(
-              J.getIRCompileLayer(), PlatformJD)) {
-        CompactUnwindInfoSupported = true;
-        OLL->addPlugin(std::move(*UIRP));
-        LLVM_DEBUG(dbgs() << "Enabled compact-unwind support.\n");
-      } else
-        consumeError(UIRP.takeError());
+
+      // Check if the bootstrap map says that we should force eh-frames:
+      // Older libunwinds require this as they don't have a dynamic
+      // registration API for compact-unwind.
+      std::optional<bool> ForceEHFrames;
+      if (auto Err = J.getExecutionSession().getBootstrapMapValue<bool, bool>(
+              "darwin-use-ehframes-only", ForceEHFrames))
+        return Err;
+      if (ForceEHFrames.has_value())
+        UseEHFrames = *ForceEHFrames;
+      else
+        UseEHFrames = false;
+
+      // If UseEHFrames hasn't been set then we're good to use compact-unwind.
+      if (!UseEHFrames) {
+        if (auto UIRP = UnwindInfoRegistrationPlugin::Create(
+                J.getIRCompileLayer(), PlatformJD)) {
+          OLL->addPlugin(std::move(*UIRP));
+          LLVM_DEBUG(dbgs() << "Enabled compact-unwind support.\n");
+        } else
+          return UIRP.takeError();
+      }
     }
 
     // Otherwise fall back to standard unwind registration.
-    if (!CompactUnwindInfoSupported) {
+    if (UseEHFrames) {
       auto &ES = J.getExecutionSession();
       if (auto EHFrameRegistrar = EPCEHFrameRegistrar::Create(ES)) {
         OLL->addPlugin(std::make_unique<EHFrameRegistrationPlugin>(
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 845990d965b16..d4e341a96f5b1 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -481,6 +481,15 @@ MachOPlatform::MachOPlatform(
   ObjLinkingLayer.addPlugin(std::make_unique<MachOPlatformPlugin>(*this));
   PlatformJD.addGenerator(std::move(OrcRuntimeGenerator));
 
+  {
+    // Check for force-eh-frame
+    std::optional<bool> ForceEHFrames;
+    if ((Err = ES.getBootstrapMapValue<bool, bool>("darwin-use-ehframes-only",
+                                                   ForceEHFrames)))
+      return;
+    this->ForceEHFrames = ForceEHFrames.has_value() ? *ForceEHFrames : false;
+  }
+
   BootstrapInfo BI;
   Bootstrap = &BI;
 
@@ -811,6 +820,12 @@ void MachOPlatform::MachOPlatformPlugin::modifyPassConfig(
       HeaderAddr = I->second;
   }
 
+  // If we're forcing eh-frame use then discard the compact-unwind section
+  // immediately to prevent FDEs from being stripped.
+  if (MP.ForceEHFrames)
+    if (auto *CUSec = LG.findSectionByName(MachOCompactUnwindSectionName))
+      LG.removeSection(*CUSec);
+
   // Point the libunwind dso-base absolute symbol at the header for the
   // JITDylib. This will prevent us from synthesizing a new header for
   // every object.
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index ffc1bbfa121b3..282d4cfc2ddad 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
 
 add_llvm_component_library(LLVMOrcTargetProcess
   ExecutorSharedMemoryMapperService.cpp
+  DefaultHostBootstrapValues.cpp
   JITLoaderGDB.cpp
   JITLoaderPerf.cpp
   JITLoaderVTune.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.cpp
new file mode 100644
index 0000000000000..c95b7ac5159fe
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.cpp
@@ -0,0 +1,36 @@
+//===----- DefaultHostBootstrapValues.cpp - Defaults for host process -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
+
+#ifdef __APPLE__
+#include <dlfcn.h>
+#endif // __APPLE__
+
+namespace llvm::orc {
+
+void addDefaultBootstrapValuesForHostProcess(
+    StringMap<std::vector<char>> &BootstrapMap,
+    StringMap<ExecutorAddr> &BootstrapSymbols) {
+
+  // FIXME: We probably shouldn't set these on Windows?
+  BootstrapSymbols[rt::RegisterEHFrameSectionWrapperName] =
+      ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper);
+  BootstrapSymbols[rt::DeregisterEHFrameSectionWrapperName] =
+      ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper);
+
+#ifdef __APPLE__
+  if (!dlsym(RTLD_DEFAULT, "__unw_add_find_dynamic_unwind_sections"))
+    BootstrapMap["darwin-use-ehframes-only"].push_back(1);
+#endif // __APPLE__
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
index d88fbbfc86385..c4f201b353d27 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
@@ -106,10 +106,6 @@ void addTo(StringMap<ExecutorAddr> &M) {
       ExecutorAddr::fromPtr(&writeBuffersWrapper);
   M[rt::MemoryWritePointersWrapperName] =
       ExecutorAddr::fromPtr(&writePointersWrapper);
-  M[rt::RegisterEHFrameSectionWrapperName] =
-      ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper);
-  M[rt::DeregisterEHFrameSectionWrapperName] =
-      ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper);
   M[rt::RunAsMainWrapperName] = ExecutorAddr::fromPtr(&runAsMainWrapper);
   M[rt::RunAsVoidFunctionWrapperName] =
       ExecutorAddr::fromPtr(&runAsVoidFunctionWrapper);
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
index 86b89a38c1760..dbdec77327774 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX, LLVM_ENABLE_THREADS
+#include "llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
@@ -187,6 +188,8 @@ int main(int argc, char *argv[]) {
                 std::make_unique<SimpleRemoteEPCServer::ThreadDispatcher>());
             S.bootstrapSymbols() =
                 SimpleRemoteEPCServer::defaultBootstrapSymbols();
+            addDefaultBootstrapValuesForHostProcess(S.bootstrapMap(),
+                                                    S.bootstrapSymbols());
             S.services().push_back(
                 std::make_unique<rt_bootstrap::SimpleExecutorMemoryManager>());
             S.services().push_back(

From 75e20e0b801ef754c17eec8d0231d23cf8f17aed Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 6 Feb 2025 16:27:10 +0100
Subject: [PATCH 080/282] [Mips] Use getSignedConstant() in or combine

Fixes https://github.com/llvm/llvm-project/issues/125954.

(cherry picked from commit 88b1d16c4a040cc3d919be13f0baf44f30a5aa13)
---
 llvm/lib/Target/Mips/MipsISelLowering.cpp |  2 +-
 llvm/test/CodeGen/Mips/dins.ll            | 53 +++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7c4257c222c0b..85e05bc1b129a 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -1004,7 +1004,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(
           MipsISD::Ins, DL, N->getValueType(0),
           isConstCase
-              ? DAG.getConstant(CN1->getSExtValue() >> SMPos0, DL, ValTy)
+              ? DAG.getSignedConstant(CN1->getSExtValue() >> SMPos0, DL, ValTy)
               : SrlX,
           DAG.getConstant(SMPos0, DL, MVT::i32),
           DAG.getConstant(ValTy.getSizeInBits() / 8 < 8 ? SMSize0 & 31
diff --git a/llvm/test/CodeGen/Mips/dins.ll b/llvm/test/CodeGen/Mips/dins.ll
index cdb8f419eb2be..2e9816e7d3642 100644
--- a/llvm/test/CodeGen/Mips/dins.ll
+++ b/llvm/test/CodeGen/Mips/dins.ll
@@ -304,3 +304,56 @@ entry:
   store volatile i32 %or, ptr %x.addr, align 4
   ret i32 %and
 }
+
+define i32 @pr125954(i32 %arg, i1 %c) {
+; MIPS64R2-LABEL: pr125954:
+; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    sll $1, $4, 0
+; MIPS64R2-NEXT:    addiu $2, $zero, -1
+; MIPS64R2-NEXT:    move $3, $1
+; MIPS64R2-NEXT:    ins $3, $2, 8, 24
+; MIPS64R2-NEXT:    andi $2, $1, 255
+; MIPS64R2-NEXT:    sll $1, $5, 0
+; MIPS64R2-NEXT:    andi $1, $1, 1
+; MIPS64R2-NEXT:    jr $ra
+; MIPS64R2-NEXT:    movn $2, $3, $1
+;
+; MIPS32R2-LABEL: pr125954:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    andi $2, $4, 255
+; MIPS32R2-NEXT:    addiu $1, $zero, -256
+; MIPS32R2-NEXT:    or $1, $2, $1
+; MIPS32R2-NEXT:    andi $3, $5, 1
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movn $2, $1, $3
+;
+; MIPS16-LABEL: pr125954:
+; MIPS16:       # %bb.0:
+; MIPS16-NEXT:    li $6, 1
+; MIPS16-NEXT:    and $6, $5
+; MIPS16-NEXT:    li $2, 255
+; MIPS16-NEXT:    and $2, $4
+; MIPS16-NEXT:    move $3, $zero
+; MIPS16-NEXT:    beqz $6, $BB2_2 # 16 bit inst
+; MIPS16-NEXT:  # %bb.1:
+; MIPS16-NEXT:    addiu $3, -256
+; MIPS16-NEXT:    or $2, $3
+; MIPS16-NEXT:  $BB2_2:
+; MIPS16-NEXT:    jrc $ra
+;
+; MIPS64R2N32-LABEL: pr125954:
+; MIPS64R2N32:       # %bb.0:
+; MIPS64R2N32-NEXT:    sll $1, $4, 0
+; MIPS64R2N32-NEXT:    addiu $2, $zero, -1
+; MIPS64R2N32-NEXT:    move $3, $1
+; MIPS64R2N32-NEXT:    ins $3, $2, 8, 24
+; MIPS64R2N32-NEXT:    andi $2, $1, 255
+; MIPS64R2N32-NEXT:    sll $1, $5, 0
+; MIPS64R2N32-NEXT:    andi $1, $1, 1
+; MIPS64R2N32-NEXT:    jr $ra
+; MIPS64R2N32-NEXT:    movn $2, $3, $1
+  %and = and i32 %arg, 255
+  %or = or i32 %and, -256
+  %sel = select i1 %c, i32 %or, i32 %and
+  ret i32 %sel
+}

From 5b48526fe347235fd6ac8ecc51928749b0e5deda Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Fri, 31 Jan 2025 06:58:01 -0600
Subject: [PATCH 081/282] [SystemZ] Replace SELRMux with COPY in case of
 identical operands. (#125108)

If both operands of a SELRMux use the same register which is killed, and
the SELRMux is expanded to a jump sequence, a broken MIR results if the
kill flag is not removed.

This patch replaces the SELRMux with a COPY in these cases.

(cherry picked from commit eb1a571114a799f532a12b2f062746d3b92fed88)
---
 .../lib/Target/SystemZ/SystemZPostRewrite.cpp | 12 +++++++++++
 llvm/test/CodeGen/SystemZ/cond-move-10.mir    | 21 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 llvm/test/CodeGen/SystemZ/cond-move-10.mir

diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
index e15f9027cc209..cf3073f0f2090 100644
--- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -107,6 +107,18 @@ void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB,
   bool Src1IsHigh = SystemZ::isHighReg(Src1Reg);
   bool Src2IsHigh = SystemZ::isHighReg(Src2Reg);
 
+  // In rare cases both sources are the same register (after
+  // machine-cse). This must be handled as it may lead to wrong-code (after
+  // machine-cp) if the kill flag on Src1 isn't cleared (with
+  // expandCondMove()).
+  if (Src1Reg == Src2Reg) {
+    BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(),
+            TII->get(SystemZ::COPY), DestReg)
+        .addReg(MBBI->getOperand(1).getReg(), getRegState(MBBI->getOperand(1)));
+    MBBI->eraseFromParent();
+    return;
+  }
+
   // If sources and destination aren't all high or all low, we may be able to
   // simplify the operation by moving one of the sources to the destination
   // first.  But only if this doesn't clobber the other source.
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-10.mir b/llvm/test/CodeGen/SystemZ/cond-move-10.mir
new file mode 100644
index 0000000000000..1db960829729e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/cond-move-10.mir
@@ -0,0 +1,21 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z15 -run-pass=systemz-post-rewrite \
+# RUN:   2>&1 | FileCheck %s
+
+# The SELRMux has two identical sources - replace with a copy instruction.
+# CHECK:      name: fun0
+# CHECK:      renamable $r1l = AHIMuxK killed renamable $r1l, -1, implicit-def dead $cc
+# CHECK-NEXT: CHIMux renamable $r5h, 9, implicit-def $cc
+# CHECK-NEXT: $r14h = COPY killed renamable $r1l
+---
+name:            fun0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r1l, $r5h
+    renamable $r1l = AHIMuxK killed renamable $r1l, -1, implicit-def dead $cc
+    CHIMux renamable $r5h, 9, implicit-def $cc
+    renamable $r14h = SELRMux killed renamable $r1l, renamable $r1l, 14, 8, implicit $cc
+    $r14l = COPY killed renamable $r14h
+    $r14d = LGFR $r14l
+    Return implicit $r14d
+...

From f7c7db9b6fb14464d8ac6e224b801f6de81638d9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 1 Feb 2025 21:40:58 +0000
Subject: [PATCH 082/282] [VPlan] Check VPWidenIntrinsicSC in
 VPRecipeWithIRFlags::classof.

When VPWidenIntrinsicRecipe was changed to inhert from VPRecipeWithIRFlags,
VPRecipeWithIRFlags::classof wasn't updated accordingly. Also check for
VPWidenIntrinsicSC in VPRecipeWithIRFlags::classof.

Fixes https://github.com/llvm/llvm-project/issues/125301.

(cherry picked from commit 75b922dccfc35ec25a520b1941e6682a300802b8)
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |   1 +
 .../AArch64/drop-poison-generating-flags.ll   | 151 ++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a1ff684b2b801..6c95b08a02014 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1058,6 +1058,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
            R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenIntrinsicSC ||
            R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
            R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll
new file mode 100644
index 0000000000000..53bd2d119c1ae
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -mcpu=neoverse-v2 -force-vector-width=4 -S %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux"
+
+; Test case where we visit a VPWidenIntrinsic (for @llvm.fabs) with nnan flags.
+; For https://github.com/llvm/llvm-project/issues/125301.
+define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %dst.1, ptr noalias %src.1, ptr %src.2) {
+; CHECK-LABEL: define void @check_widen_intrinsic_with_nnan(
+; CHECK-SAME: ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[SRC_1:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[SRC_1]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <4 x double> [[TMP3]], splat (double 1.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP0]], -1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr double, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP8]], i32 8, <4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK:       [[PRED_LOAD_IF]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr [[SRC_2]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
+; CHECK:       [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x double> [ poison, %[[VECTOR_BODY]] ], [ [[TMP11]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]]
+; CHECK:       [[PRED_LOAD_IF1]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[SRC_2]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x double> [[TMP12]], double [[TMP14]], i32 1
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK:       [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x double> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK:       [[PRED_LOAD_IF3]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr [[SRC_2]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP18]], i32 2
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK:       [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x double> [ [[TMP16]], %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; CHECK-NEXT:    br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_IF5]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr [[SRC_2]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> [[TMP20]], double [[TMP22]], i32 3
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x double> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP0]], -1
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP27]], i32 8, <4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP28:%.*]] = fcmp oeq <4 x double> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = or <4 x i1> [[TMP5]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP29]], i32 0
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP31]], i64 [[TMP25]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[PREDPHI]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[TMP32]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP33]], i32 4, <4 x i1> [[TMP30]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds double, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT:    [[L_1:%.*]] = load double, ptr [[GEP_SRC_1]], align 8
+; CHECK-NEXT:    [[ABS:%.*]] = tail call nnan double @llvm.fabs.f64(double [[L_1]])
+; CHECK-NEXT:    [[C_0:%.*]] = fcmp olt double [[ABS]], 1.000000e+00
+; CHECK-NEXT:    br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[SRC_2]], align 8
+; CHECK-NEXT:    [[IV_SUB_1:%.*]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT:    [[GEP_IV_SUB_1:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[IV_SUB_1]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP_IV_SUB_1]], align 8
+; CHECK-NEXT:    [[C_1:%.*]] = fcmp oeq double [[L_2]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[C_1]], label %[[MERGE:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[IV_SUB_2:%.*]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT:    [[GEP_IV_SUB_2:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[IV_SUB_2]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP_IV_SUB_2]], align 8
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[MERGE_IV:%.*]] = phi i64 [ [[IV_SUB_2]], %[[ELSE]] ], [ [[IV_SUB_1]], %[[THEN]] ]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i64 [[MERGE_IV]]
+; CHECK-NEXT:    store i32 10, ptr [[GEP_DST_1]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.src.1 = getelementptr inbounds double, ptr %src.1, i64 %iv
+  %l.1 = load double, ptr %gep.src.1, align 8
+  %abs = tail call nnan double @llvm.fabs.f64(double %l.1)
+  %c.0 = fcmp olt double %abs, 1.000000e+00
+  br i1 %c.0, label %then, label %else
+
+then:
+  %l.2 = load double, ptr %src.2, align 8
+  %iv.sub.1 = add nsw i64 %iv, -1
+  %gep.iv.sub.1 = getelementptr double, ptr %dst.0, i64 %iv.sub.1
+  store double 0.000000e+00, ptr %gep.iv.sub.1, align 8
+  %c.1 = fcmp oeq double %l.2, 0.000000e+00
+  br i1 %c.1, label %merge, label %loop.latch
+
+else:
+  %iv.sub.2 = add nsw i64 %iv, -1
+  %gep.iv.sub.2 = getelementptr double, ptr %dst.0, i64 %iv.sub.2
+  store double 0.000000e+00, ptr %gep.iv.sub.2, align 8
+  br label %merge
+
+merge:
+  %merge.iv = phi i64 [ %iv.sub.2, %else ], [ %iv.sub.1, %then ]
+  %gep.dst.1 = getelementptr inbounds i32, ptr %dst.1, i64 %merge.iv
+  store i32 10, ptr %gep.dst.1, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond.not, label %exit, label %loop.header
+
+exit:
+ret void
+}
+
+declare double @llvm.fabs.f64(double)

From 820c8c79c87532b44c62a53cc3865cda8a0846ee Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 3 Feb 2025 13:48:42 -0800
Subject: [PATCH 083/282] [RISCV] Check isFixedLengthVector before calling
 getVectorNumElements in getSingleShuffleSrc. (#125455)

I have been unsuccessful at further reducing the test. The
failure requires a shuffle with 2 scalable->fixed extracts with
the same source. 0 is the only valid index for a scalable->fixed
extract so the 2 sources must be the same extract. Shuffles with
the same source are aggressively canonicalized to a unary shuffle.
So it requires the extracts to become identical through other
optimizations without the shuffle being canonicalized before it is
lowered.

Fixes #125306.

(cherry picked from commit 7c5100d36d8027dd205d6ec410a63c3930a1d9c1)
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |   3 +-
 llvm/test/CodeGen/RISCV/rvv/pr125306.ll     | 118 ++++++++++++++++++++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/pr125306.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8b5ee3e67ce63..585af30f8c8a1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4512,7 +4512,8 @@ static SDValue getSingleShuffleSrc(MVT VT, MVT ContainerVT, SDValue V1,
 
   // Src needs to have twice the number of elements.
   unsigned NumElts = VT.getVectorNumElements();
-  if (Src.getValueType().getVectorNumElements() != (NumElts * 2))
+  if (!Src.getValueType().isFixedLengthVector() ||
+      Src.getValueType().getVectorNumElements() != (NumElts * 2))
     return SDValue();
 
   // The extracts must extract the two halves of the source.
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr125306.ll b/llvm/test/CodeGen/RISCV/rvv/pr125306.ll
new file mode 100644
index 0000000000000..111f87de220db
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/pr125306.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v | FileCheck %s
+
+; Test for an "Invalid size request on a scalable vector". Attempts to reduce
+; the test faurther were not successful. The failure requires a shuffle with 2
+; scalable->fixed extracts from the same vector. 0 is the only valid index for a
+; scalable->fixed extract so the 2 extract must be the same. Shuffles with the
+; same source are aggressively canonicalized to a unary shuffle so it requires
+; the extracts to become identical through other optimizations without the
+; shuffle being canonicalized before it is lowered.
+
+define <2 x i32> @main(ptr %0) {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vse32.v v8, (zero)
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    sw zero, 80(zero)
+; CHECK-NEXT:    lui a1, 7
+; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    li a4, 16
+; CHECK-NEXT:    lui a5, 2
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vse32.v v10, (a2)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    sh zero, -392(a3)
+; CHECK-NEXT:    sh zero, 534(a3)
+; CHECK-NEXT:    sh zero, 1460(a3)
+; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    vse32.v v10, (a2)
+; CHECK-NEXT:    li a2, 40
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    sh zero, -1710(a5)
+; CHECK-NEXT:    sh zero, -784(a5)
+; CHECK-NEXT:    sh zero, 142(a5)
+; CHECK-NEXT:    lw a5, -304(a1)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vadd.vi v9, v11, -1
+; CHECK-NEXT:    vse32.v v10, (a3)
+; CHECK-NEXT:    sh zero, 0(a0)
+; CHECK-NEXT:    lw a0, -188(a1)
+; CHECK-NEXT:    vse32.v v10, (a2)
+; CHECK-NEXT:    lw a2, -188(a1)
+; CHECK-NEXT:    lw a3, 1244(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    lw a0, 1244(a1)
+; CHECK-NEXT:    lw a1, -304(a1)
+; CHECK-NEXT:    vmv.v.x v10, a3
+; CHECK-NEXT:    vmv.v.x v11, a5
+; CHECK-NEXT:    vslide1down.vx v8, v8, zero
+; CHECK-NEXT:    vslide1down.vx v10, v10, zero
+; CHECK-NEXT:    vmin.vv v8, v10, v8
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vslide1down.vx v11, v11, zero
+; CHECK-NEXT:    vmin.vx v10, v10, a2
+; CHECK-NEXT:    vmin.vx v10, v10, a1
+; CHECK-NEXT:    vmin.vv v11, v8, v11
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vand.vv v9, v11, v9
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vse32.v v9, (a4)
+; CHECK-NEXT:    sh zero, 0(zero)
+; CHECK-NEXT:    ret
+entry:
+  store <16 x i32> zeroinitializer, ptr null, align 4
+  store <8 x i32> zeroinitializer, ptr %0, align 4
+  store <4 x i32> zeroinitializer, ptr getelementptr inbounds nuw (i8, ptr null, i64 64), align 4
+  store i32 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 80), align 4
+  %1 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 29916), align 4
+  %broadcast.splatinsert53 = insertelement <4 x i32> zeroinitializer, i32 %1, i64 0
+  %2 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 28484), align 4
+  %broadcast.splatinsert55 = insertelement <4 x i32> zeroinitializer, i32 %2, i64 0
+  %3 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %broadcast.splatinsert53, <4 x i32> %broadcast.splatinsert55)
+  %4 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 28368), align 4
+  %broadcast.splatinsert57 = insertelement <4 x i32> zeroinitializer, i32 %4, i64 0
+  %5 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %3, <4 x i32> %broadcast.splatinsert57)
+  store i16 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 3704), align 2
+  store i16 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 4630), align 2
+  %6 = shufflevector <4 x i32> %5, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 4>
+  store <2 x i32> %6, ptr getelementptr inbounds nuw (i8, ptr null, i64 16), align 4
+  store i16 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 5556), align 2
+  store i16 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 6482), align 2
+  store <2 x i32> zeroinitializer, ptr getelementptr inbounds nuw (i8, ptr null, i64 24), align 4
+  store i16 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 7408), align 2
+  store i16 0, ptr getelementptr inbounds nuw (i8, ptr null, i64 8334), align 2
+  store <2 x i32> zeroinitializer, ptr getelementptr inbounds nuw (i8, ptr null, i64 32), align 4
+  store i16 0, ptr %0, align 2
+  store <2 x i32> zeroinitializer, ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 4
+  %7 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 29916), align 4
+  %broadcast.splatinsert165 = insertelement <4 x i32> poison, i32 %7, i64 0
+  %8 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 28484), align 4
+  %broadcast.splatinsert167 = insertelement <4 x i32> poison, i32 %8, i64 0
+  %9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %broadcast.splatinsert165, <4 x i32> %broadcast.splatinsert167)
+  %10 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 28368), align 4
+  %broadcast.splatinsert169 = insertelement <4 x i32> poison, i32 %10, i64 0
+  %11 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %9, <4 x i32> %broadcast.splatinsert169)
+  store i16 0, ptr null, align 2
+  %12 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 29916), align 4
+  %broadcast.splatinsert179 = insertelement <4 x i32> poison, i32 %12, i64 0
+  %13 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 28484), align 4
+  %broadcast.splatinsert181 = insertelement <4 x i32> poison, i32 %13, i64 0
+  %14 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %broadcast.splatinsert179, <4 x i32> %broadcast.splatinsert181)
+  %15 = load i32, ptr getelementptr inbounds nuw (i8, ptr null, i64 28368), align 4
+  %broadcast.splatinsert183 = insertelement <4 x i32> poison, i32 %15, i64 0
+  %16 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %14, <4 x i32> %broadcast.splatinsert183)
+  %17 = shufflevector <4 x i32> %11, <4 x i32> %16, <2 x i32> <i32 0, i32 4>
+  ret <2 x i32> %17
+}

From 0d363c30b875e91a9f8e20a17b5dc7281e58cf2e Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Wed, 5 Feb 2025 13:41:48 +0800
Subject: [PATCH 084/282] [C++20] [Modules] Don't diagnose duplicated friend
 declarations between modules incorrectly

Close https://github.com/llvm/llvm-project/issues/125521

We shouldn't use the ownership information for friend declarations to do
anything.

(cherry picked from commit c5a9a72b3cd118a23193d01bf9393fbf1d4b90ae)
---
 clang/lib/AST/ASTContext.cpp              |  3 +-
 clang/lib/Serialization/ASTReaderDecl.cpp |  5 ++
 clang/test/Modules/pr125521.cppm          | 57 +++++++++++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Modules/pr125521.cppm

diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index e58091ce95f62..dbeb3d105ad17 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1055,7 +1055,8 @@ void ASTContext::PrintStats() const {
 void ASTContext::mergeDefinitionIntoModule(NamedDecl *ND, Module *M,
                                            bool NotifyListeners) {
   if (NotifyListeners)
-    if (auto *Listener = getASTMutationListener())
+    if (auto *Listener = getASTMutationListener();
+        Listener && !ND->isUnconditionallyVisible())
       Listener->RedefinedHiddenDefinition(ND, M);
 
   MergedDefModules[cast<NamedDecl>(ND->getCanonicalDecl())].push_back(M);
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 8210eb2143acf..1aa94d5a22abe 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3746,6 +3746,11 @@ void ASTDeclReader::checkMultipleDefinitionInNamedModules(ASTReader &Reader,
       Func && Func->getTemplateSpecializationInfo())
     return;
 
+  // The module ownership of in-class friend declaration is not straightforward.
+  // Avoid diagnosing such cases.
+  if (D->getFriendObjectKind() || Previous->getFriendObjectKind())
+    return;
+
   Module *M = Previous->getOwningModule();
   if (!M)
     return;
diff --git a/clang/test/Modules/pr125521.cppm b/clang/test/Modules/pr125521.cppm
new file mode 100644
index 0000000000000..d064cdfe3eb73
--- /dev/null
+++ b/clang/test/Modules/pr125521.cppm
@@ -0,0 +1,57 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/mod2.cppm -emit-module-interface -o %t/mod2.pcm
+// RUN: %clang_cc1 -std=c++20 %t/mod1.cppm -emit-module-interface -o %t/mod1.pcm \
+// RUN:     -fmodule-file=Mod2=%t/mod2.pcm
+// RUN: %clang_cc1 -std=c++20 %t/test.cc -fmodule-file=Mod2=%t/mod2.pcm -fmodule-file=Mod=%t/mod1.pcm \
+// RUN:     -fsyntax-only -verify
+
+// RUN: %clang_cc1 -std=c++20 %t/mod2.cppm -emit-module-interface -o %t/mod2.pcm
+// RUN: %clang_cc1 -std=c++20 %t/mod1.cppm -emit-module-interface -o %t/mod1.pcm \
+// RUN:     -fmodule-file=Mod2=%t/mod2.pcm
+// RUN: %clang_cc1 -std=c++20 %t/mod1.pcm  -fmodule-file=Mod2=%t/mod2.pcm -emit-llvm -o - \
+// RUN:     | FileCheck %t/mod1.cppm
+
+//--- hello.h
+template <typename V> int get() noexcept {return 0;};
+
+template <typename T>
+class List
+{
+    template <typename V> friend int get() noexcept;
+};
+
+//--- mod2.cppm
+module;
+#include "hello.h"
+export module Mod2;
+export const char *modFn2() {
+    List<int> a;
+    return "hello";
+}
+
+//--- mod1.cppm
+module;
+#include "hello.h"
+export module Mod;
+import Mod2;
+export extern "C" const char *modFn() {
+    List<int> a;
+    List<double> b;
+    return modFn2();
+}
+
+// Fine enough to check it won't crash.
+// CHECK: define {{.*}}@modFn
+
+//--- test.cc
+// expected-no-diagnostics
+import Mod;
+import Mod2;
+
+void test() {
+    modFn();
+    modFn2();
+}

From 04d55131ce724ad4a8ca2686ee54df83484f94e9 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Thu, 30 Jan 2025 16:09:19 -0800
Subject: [PATCH 085/282] [mlir][cmake] Add missing MLIRTestDialect
 dependencies

This cherry picks
[mlir] Fix build race condition in Pass Manager tests (d906da5ead2764579395e5006c517f2ec9afd46f)
to the 20.x release branch.

This addresses issues that started with
https://github.com/llvm/llvm-project/pull/123910, which is already on the 20.x branch.

Linaro noticed this on our flang dylib (shared library) build bot.

In file included from /home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/Pass/TestPassManager.cpp:10:
/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/Pass/../Dialect/Test/TestOps.h:148:10: fatal error: 'TestOps.h.inc' file not found
  148 | #include "TestOps.h.inc"
      |          ^~~~~~~~~~~~~~~

We have tested these changes on the buildbot for the last 2 days and had no problems.
Whereas before it was failing maybe 1 in 10 builds, enough that multiple people
in the community noticed it.

Reported in https://github.com/llvm/llvm-project/issues/124485.
---
 mlir/test/lib/Pass/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/test/lib/Pass/CMakeLists.txt b/mlir/test/lib/Pass/CMakeLists.txt
index 6698af86b8ae6..c5d0bab8ec749 100644
--- a/mlir/test/lib/Pass/CMakeLists.txt
+++ b/mlir/test/lib/Pass/CMakeLists.txt
@@ -10,12 +10,14 @@ add_mlir_library(MLIRTestPass
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Pass
+
+  LINK_LIBS PUBLIC
+  MLIRTestDialect
   )
 mlir_target_link_libraries(MLIRTestPass PUBLIC
   ${conversion_libs}
   MLIRIR
   MLIRPass
-  MLIRTestDialect
   )
 
 target_include_directories(MLIRTestPass

From 38333d5e4258edaaaf449b720391d8bf2888e096 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard@arm.com>
Date: Fri, 31 Jan 2025 09:03:01 +0000
Subject: [PATCH 086/282] [ARM] Empty structs are 1-byte for C++ ABI (#124762)

For C++ (but not C), empty structs should be passed to functions as if
they are a 1 byte object with 1 byte alignment.

This is defined in Arm's CPPABI32:
  https://github.com/ARM-software/abi-aa/blob/main/cppabi32/cppabi32.rst
  For the purposes of parameter passing in AAPCS32, a parameter whose
  type is an empty class shall be treated as if its type were an
  aggregate with a single member of type unsigned byte.

The AArch64 equivalent of this has an exception for structs containing
an array of size zero, I've kept that logic for ARM. I've not found a
reason for this exception, but I've checked that GCC does have the same
behaviour for ARM as it does for AArch64.

The AArch64 version has an Apple ABI with different rules, which ignores
empty structs in both C and C++. This is documented at
https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms.
The ARM equivalent of that appears to be AAPCS16_VFP, used for WatchOS,
but I can't find any documentation for that ABI, so I'm not sure what
rules it should follow. For now I've left it following the AArch64 Apple
rules.
---
 clang/docs/ReleaseNotes.rst             |   5 +
 clang/include/clang/Basic/LangOptions.h |   2 +
 clang/lib/CodeGen/Targets/ARM.cpp       |  45 +++++++-
 clang/test/CodeGen/arm-empty-args.cpp   | 131 ++++++++++++++++++++++++
 4 files changed, 178 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CodeGen/arm-empty-args.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3281ac0c4dbe2..b61563ade0a1e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1199,6 +1199,11 @@ Arm and AArch64 Support
 - Runtime detection of depended-on Function Multi Versioning features has been added
   in accordance with the Arm C Language Extensions (ACLE).
 
+- The ARM calling convention for empty structs in C++ mode was changed to pass
+  them as if they have a size of 1 byte, matching the AAPCS32 specification and
+  GCC's implementation. The previous behaviour of ignoring the argument can be
+  restored using the -fclang-abi-compat=19 (or earlier) option.
+
 Android Support
 ^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 114a5d34a008b..16c35bcf49339 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -246,6 +246,8 @@ class LangOptionsBase {
     ///   construction vtable because it hasn't added 'type' as a substitution.
     ///   - Skip mangling enclosing class templates of member-like friend
     ///   function templates.
+    ///   - Ignore empty struct arguments in C++ mode for ARM, instead of
+    ///   passing them as if they had a size of 1 byte.
     Ver19,
 
     /// Conform to the underlying platform's C and C++ ABIs as closely
diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp
index 2d858fa2f3c3a..47e31ceeaf294 100644
--- a/clang/lib/CodeGen/Targets/ARM.cpp
+++ b/clang/lib/CodeGen/Targets/ARM.cpp
@@ -71,6 +71,7 @@ class ARMABIInfo : public ABIInfo {
                                   unsigned functionCallConv) const;
   ABIArgInfo classifyHomogeneousAggregate(QualType Ty, const Type *Base,
                                           uint64_t Members) const;
+  bool shouldIgnoreEmptyArg(QualType Ty) const;
   ABIArgInfo coerceIllegalVector(QualType Ty) const;
   bool isIllegalVectorType(QualType Ty) const;
   bool containsAnyFP16Vectors(QualType Ty) const;
@@ -328,6 +329,31 @@ ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
   return ABIArgInfo::getDirect(nullptr, 0, nullptr, false, Align);
 }
 
+bool ARMABIInfo::shouldIgnoreEmptyArg(QualType Ty) const {
+  uint64_t Size = getContext().getTypeSize(Ty);
+  assert((isEmptyRecord(getContext(), Ty, true) || Size == 0) &&
+         "Arg is not empty");
+
+  // Empty records are ignored in C mode, and in C++ on WatchOS.
+  if (!getContext().getLangOpts().CPlusPlus ||
+      getABIKind() == ARMABIKind::AAPCS16_VFP)
+    return true;
+
+  // In C++ mode, arguments which have sizeof() == 0 are ignored. This is not a
+  // situation which is defined by any C++ standard or ABI, but this matches
+  // GCC's de facto ABI.
+  if (Size == 0)
+    return true;
+
+  // Clang 19.0 and earlier always ignored empty struct arguments in C++ mode.
+  if (getContext().getLangOpts().getClangABICompat() <=
+      LangOptions::ClangABI::Ver19)
+    return true;
+
+  // Otherwise, they are passed as if they have a size of 1 byte.
+  return false;
+}
+
 ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
                                             unsigned functionCallConv) const {
   // 6.1.2.1 The following argument types are VFP CPRCs:
@@ -366,9 +392,15 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
     return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
   }
 
-  // Ignore empty records.
-  if (isEmptyRecord(getContext(), Ty, true))
-    return ABIArgInfo::getIgnore();
+  // Empty records are either ignored completely or passed as if they were a
+  // 1-byte object, depending on the ABI and language standard.
+  if (isEmptyRecord(getContext(), Ty, true) ||
+      getContext().getTypeSize(Ty) == 0) {
+    if (shouldIgnoreEmptyArg(Ty))
+      return ABIArgInfo::getIgnore();
+    else
+      return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext()));
+  }
 
   if (IsAAPCS_VFP) {
     // Homogeneous Aggregates need to be expanded when we can fit the aggregate
@@ -588,7 +620,8 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic,
 
   // Otherwise this is an AAPCS variant.
 
-  if (isEmptyRecord(getContext(), RetTy, true))
+  if (isEmptyRecord(getContext(), RetTy, true) ||
+      getContext().getTypeSize(RetTy) == 0)
     return ABIArgInfo::getIgnore();
 
   // Check for homogeneous aggregates with AAPCS-VFP.
@@ -752,7 +785,9 @@ RValue ARMABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
   CharUnits SlotSize = CharUnits::fromQuantity(4);
 
   // Empty records are ignored for parameter passing purposes.
-  if (isEmptyRecord(getContext(), Ty, true))
+  uint64_t Size = getContext().getTypeSize(Ty);
+  bool IsEmpty = isEmptyRecord(getContext(), Ty, true);
+  if ((IsEmpty || Size == 0) && shouldIgnoreEmptyArg(Ty))
     return Slot.asRValue();
 
   CharUnits TySize = getContext().getTypeSizeInChars(Ty);
diff --git a/clang/test/CodeGen/arm-empty-args.cpp b/clang/test/CodeGen/arm-empty-args.cpp
new file mode 100644
index 0000000000000..4e61c78b73ab9
--- /dev/null
+++ b/clang/test/CodeGen/arm-empty-args.cpp
@@ -0,0 +1,131 @@
+// RUN: %clang_cc1 -triple armv7a-linux-gnueabi -emit-llvm -o - -x c %s | FileCheck %s --check-prefixes=CHECK,C
+// RUN: %clang_cc1 -triple armv7a-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CXX
+// RUN: %clang_cc1 -triple armv7a-linux-gnueabi -emit-llvm -o - %s -fclang-abi-compat=19 | FileCheck %s --check-prefixes=CHECK,CXXCLANG19
+// RUN: %clang_cc1 -triple thumbv7k-apple-watchos2.0 -target-abi aapcs16 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WATCHOS
+
+// Empty structs are ignored for PCS purposes on WatchOS and in C mode
+// elsewhere.  In C++ mode they consume a register slot though. Functions are
+// slightly bigger than minimal to make confirmation against actual GCC
+// behaviour easier.
+
+#if __cplusplus
+#define EXTERNC extern "C"
+#else
+#define EXTERNC
+#endif
+
+struct Empty {};
+
+// C: define{{.*}} i32 @empty_arg(i32 noundef %a)
+// CXX: define{{.*}} i32 @empty_arg(i8 %e.coerce, i32 noundef %a)
+// CXXCLANG19: define{{.*}} i32 @empty_arg(i32 noundef %a)
+// WATCHOS: define{{.*}} i32 @empty_arg(i32 noundef %a)
+EXTERNC int empty_arg(struct Empty e, int a) {
+  return a;
+}
+
+// C: define{{.*}} void @empty_ret()
+// CXX: define{{.*}} void @empty_ret()
+// CXXCLANG19: define{{.*}} void @empty_ret()
+// WATCHOS: define{{.*}} void @empty_ret()
+EXTERNC struct Empty empty_ret(void) {
+  struct Empty e;
+  return e;
+}
+
+// However, what counts as "empty" is a baroque mess. This is super-empty, it's
+// ignored even in C++ mode. It also has sizeof == 0, violating C++, but that's
+// legacy for you:
+
+struct SuperEmpty {
+  int arr[0];
+};
+
+// C: define{{.*}} i32 @super_empty_arg(i32 noundef %a)
+// CXX: define{{.*}} i32 @super_empty_arg(i32 noundef %a)
+// CXXCLANG19: define{{.*}} i32 @super_empty_arg(i32 noundef %a)
+// WATCHOS: define{{.*}} i32 @super_empty_arg(i32 noundef %a)
+EXTERNC int super_empty_arg(struct SuperEmpty e, int a) {
+  return a;
+}
+
+struct SortOfEmpty {
+  struct SuperEmpty e;
+};
+
+// C: define{{.*}} i32 @sort_of_empty_arg(i32 noundef %a)
+// CXX: define{{.*}} i32 @sort_of_empty_arg(i8 %e.coerce, i32 noundef %a)
+// CXXCLANG19: define{{.*}} i32 @sort_of_empty_arg(i32 noundef %a)
+// WATCHOS: define{{.*}} i32 @sort_of_empty_arg(i32 noundef %a)
+EXTERNC int sort_of_empty_arg(struct Empty e, int a) {
+  return a;
+}
+
+// C: define{{.*}} void @sort_of_empty_ret()
+// CXX: define{{.*}} void @sort_of_empty_ret()
+// CXXCLANG19: define{{.*}} void @sort_of_empty_ret()
+// WATCHOS: define{{.*}} void @sort_of_empty_ret()
+EXTERNC struct SortOfEmpty sort_of_empty_ret(void) {
+  struct SortOfEmpty e;
+  return e;
+}
+
+#include <stdarg.h>
+
+// va_arg matches the above rules, consuming an incoming argument in cases
+// where one would be passed, and not doing so when the argument should be
+// ignored.
+
+EXTERNC int empty_arg_variadic(int a, ...) {
+// CHECK-LABEL: @empty_arg_variadic(
+// C: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// C-NOT: {{ getelementptr }}
+// CXX: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// CXX: %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i32 4
+// CXXCLANG19: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// CXXCLANG19-NOT: {{ getelementptr }}
+// WATCHOS: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// WATCHOS-NOT: {{ getelementptr }}
+  va_list vl;
+  va_start(vl, a);
+  struct Empty b = va_arg(vl, struct Empty);
+  int c = va_arg(vl, int);
+  va_end(vl);
+  return c;
+}
+
+EXTERNC int super_empty_arg_variadic(int a, ...) {
+// CHECK-LABEL: @super_empty_arg_variadic(
+// C: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// C-NOT: {{ getelementptr }}
+// CXX: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// CXX-NOT: {{ getelementptr }}
+// CXXCLANG19: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// CXXCLANG19-NOT: {{ getelementptr }}
+// WATCHOS: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// WATCHOS-NOT: {{ getelementptr }}
+  va_list vl;
+  va_start(vl, a);
+  struct SuperEmpty b = va_arg(vl, struct SuperEmpty);
+  int c = va_arg(vl, int);
+  va_end(vl);
+  return c;
+}
+
+EXTERNC int sort_of_empty_arg_variadic(int a, ...) {
+// CHECK-LABEL: @sort_of_empty_arg_variadic(
+// C: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// C-NOT: {{ getelementptr }}
+// CXX: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// CXX-NOT: {{ getelementptr }}
+// CXXCLANG19: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// CXXCLANG19-NOT: {{ getelementptr }}
+// WATCHOS: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+// WATCHOS-NOT: {{ getelementptr }}
+  va_list vl;
+  va_start(vl, a);
+  struct SortOfEmpty b = va_arg(vl, struct SortOfEmpty);
+  int c = va_arg(vl, int);
+  va_end(vl);
+  return c;
+}

From 5f2d66a9d22537332a9630708b0a7602848babc6 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Sat, 8 Feb 2025 14:14:16 +0100
Subject: [PATCH 087/282] [llvm-objcopy] Fix prints wrong path when
 dump-section output path doesn't exist (#125345)

Fix printing the correct file path in the error message when the output
file specified by `--dump-section` cannot be opened

Fixes: #125113 on ELF, MachO, Wasm
(cherry picked from commit 66bea0df75ccdd5ffed41d06c7301a116d11abcb)
---
 llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp           | 59 ++++++++++---------
 llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp       | 27 +++++----
 llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp         | 15 ++---
 .../tools/llvm-objcopy/ELF/dump-section.test  |  4 ++
 .../llvm-objcopy/MachO/dump-section.test      |  4 ++
 .../tools/llvm-objcopy/wasm/dump-section.test |  4 ++
 6 files changed, 64 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 5aa0079f3fbc7..9c78f7433ad33 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -186,27 +186,28 @@ static std::unique_ptr<Writer> createWriter(const CommonConfig &Config,
 }
 
 static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
-                               Object &Obj) {
+                               StringRef InputFilename, Object &Obj) {
   for (auto &Sec : Obj.sections()) {
     if (Sec.Name == SecName) {
       if (Sec.Type == SHT_NOBITS)
-        return createStringError(object_error::parse_failed,
-                                 "cannot dump section '%s': it has no contents",
-                                 SecName.str().c_str());
+        return createFileError(InputFilename, object_error::parse_failed,
+                               "cannot dump section '%s': it has no contents",
+                               SecName.str().c_str());
       Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
           FileOutputBuffer::create(Filename, Sec.OriginalData.size());
       if (!BufferOrErr)
-        return BufferOrErr.takeError();
+        return createFileError(Filename, BufferOrErr.takeError());
       std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
       std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(),
                 Buf->getBufferStart());
       if (Error E = Buf->commit())
-        return E;
+        return createFileError(Filename, std::move(E));
       return Error::success();
     }
   }
-  return createStringError(object_error::parse_failed, "section '%s' not found",
-                           SecName.str().c_str());
+
+  return createFileError(InputFilename, object_error::parse_failed,
+                         "section '%s' not found", SecName.str().c_str());
 }
 
 Error Object::compressOrDecompressSections(const CommonConfig &Config) {
@@ -798,7 +799,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
     StringRef SectionName;
     StringRef FileName;
     std::tie(SectionName, FileName) = Flag.split('=');
-    if (Error E = dumpSectionToFile(SectionName, FileName, Obj))
+    if (Error E =
+            dumpSectionToFile(SectionName, FileName, Config.InputFilename, Obj))
       return E;
   }
 
@@ -807,10 +809,10 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
   // us to avoid reporting the inappropriate errors about removing symbols
   // named in relocations.
   if (Error E = replaceAndRemoveSections(Config, ELFConfig, Obj))
-    return E;
+    return createFileError(Config.InputFilename, std::move(E));
 
   if (Error E = updateAndRemoveSymbols(Config, ELFConfig, Obj))
-    return E;
+    return createFileError(Config.InputFilename, std::move(E));
 
   if (!Config.SetSectionAlignment.empty()) {
     for (SectionBase &Sec : Obj.sections()) {
@@ -826,8 +828,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
         if (Config.ChangeSectionLMAValAll > 0 &&
             Seg.PAddr > std::numeric_limits<uint64_t>::max() -
                             Config.ChangeSectionLMAValAll) {
-          return createStringError(
-              errc::invalid_argument,
+          return createFileError(
+              Config.InputFilename, errc::invalid_argument,
               "address 0x" + Twine::utohexstr(Seg.PAddr) +
                   " cannot be increased by 0x" +
                   Twine::utohexstr(Config.ChangeSectionLMAValAll) +
@@ -835,8 +837,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
         } else if (Config.ChangeSectionLMAValAll < 0 &&
                    Seg.PAddr < std::numeric_limits<uint64_t>::min() -
                                    Config.ChangeSectionLMAValAll) {
-          return createStringError(
-              errc::invalid_argument,
+          return createFileError(
+              Config.InputFilename, errc::invalid_argument,
               "address 0x" + Twine::utohexstr(Seg.PAddr) +
                   " cannot be decreased by 0x" +
                   Twine::utohexstr(std::abs(Config.ChangeSectionLMAValAll)) +
@@ -849,10 +851,9 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
 
   if (!Config.ChangeSectionAddress.empty()) {
     if (Obj.Type != ELF::ET_REL)
-      return createStringError(
-          object_error::invalid_file_type,
+      return createFileError(
+          Config.InputFilename, object_error::invalid_file_type,
           "cannot change section address in a non-relocatable file");
-
     StringMap<AddressUpdate> SectionsToUpdateAddress;
     for (const SectionPatternAddressUpdate &PatternUpdate :
          make_range(Config.ChangeSectionAddress.rbegin(),
@@ -863,8 +864,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
                 .second) {
           if (PatternUpdate.Update.Kind == AdjustKind::Subtract &&
               Sec.Addr < PatternUpdate.Update.Value) {
-            return createStringError(
-                errc::invalid_argument,
+            return createFileError(
+                Config.InputFilename, errc::invalid_argument,
                 "address 0x" + Twine::utohexstr(Sec.Addr) +
                     " cannot be decreased by 0x" +
                     Twine::utohexstr(PatternUpdate.Update.Value) +
@@ -873,8 +874,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
           if (PatternUpdate.Update.Kind == AdjustKind::Add &&
               Sec.Addr > std::numeric_limits<uint64_t>::max() -
                              PatternUpdate.Update.Value) {
-            return createStringError(
-                errc::invalid_argument,
+            return createFileError(
+                Config.InputFilename, errc::invalid_argument,
                 "address 0x" + Twine::utohexstr(Sec.Addr) +
                     " cannot be increased by 0x" +
                     Twine::utohexstr(PatternUpdate.Update.Value) +
@@ -909,7 +910,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
   if (!ELFConfig.NotesToRemove.empty()) {
     if (Error Err =
             removeNotes(Obj, E, ELFConfig.NotesToRemove, Config.ErrorCallback))
-      return Err;
+      return createFileError(Config.InputFilename, std::move(Err));
   }
 
   for (const NewSectionInfo &AddedSection : Config.AddSection) {
@@ -924,7 +925,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
       return Error::success();
     };
     if (Error E = handleUserSection(AddedSection, AddSection))
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
   }
 
   for (const NewSectionInfo &NewSection : Config.UpdateSection) {
@@ -932,7 +933,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
       return Obj.updateSection(Name, Data);
     };
     if (Error E = handleUserSection(NewSection, UpdateSection))
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
   }
 
   if (!Config.AddGnuDebugLink.empty())
@@ -943,7 +944,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
   // before adding new symbols.
   if (!Obj.SymbolTable && !Config.SymbolsToAdd.empty())
     if (Error E = Obj.addNewSymbolTable())
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
 
   for (const NewSymbolInfo &SI : Config.SymbolsToAdd)
     addSymbol(Obj, SI, ELFConfig.NewSymbolVisibility);
@@ -955,7 +956,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
       if (Iter != Config.SetSectionFlags.end()) {
         const SectionFlagsUpdate &SFU = Iter->second;
         if (Error E = setSectionFlagsAndType(Sec, SFU.NewFlags, Obj.Machine))
-          return E;
+          return createFileError(Config.InputFilename, std::move(E));
       }
       auto It2 = Config.SetSectionType.find(Sec.Name);
       if (It2 != Config.SetSectionType.end())
@@ -974,7 +975,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
         Sec.Name = std::string(SR.NewName);
         if (SR.NewFlags) {
           if (Error E = setSectionFlagsAndType(Sec, *SR.NewFlags, Obj.Machine))
-            return E;
+            return createFileError(Config.InputFilename, std::move(E));
         }
         RenamedSections.insert(&Sec);
       } else if (RelocSec && !(Sec.Flags & SHF_ALLOC))
@@ -1091,7 +1092,7 @@ Error objcopy::elf::executeObjcopyOnBinary(const CommonConfig &Config,
                                     : getOutputElfType(In);
 
   if (Error E = handleArgs(Config, ELFConfig, OutputElfType, **Obj))
-    return createFileError(Config.InputFilename, std::move(E));
+    return E;
 
   if (Error E = writeOutput(Config, **Obj, Out, OutputElfType))
     return createFileError(Config.InputFilename, std::move(E));
diff --git a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
index a188425b283fa..682edffc84f34 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
@@ -306,25 +306,25 @@ static Error processLoadCommands(const MachOConfig &MachOConfig, Object &Obj) {
 }
 
 static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
-                               Object &Obj) {
+                               StringRef InputFilename, Object &Obj) {
   for (LoadCommand &LC : Obj.LoadCommands)
     for (const std::unique_ptr<Section> &Sec : LC.Sections) {
       if (Sec->CanonicalName == SecName) {
         Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
             FileOutputBuffer::create(Filename, Sec->Content.size());
         if (!BufferOrErr)
-          return BufferOrErr.takeError();
+          return createFileError(Filename, BufferOrErr.takeError());
         std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
         llvm::copy(Sec->Content, Buf->getBufferStart());
 
         if (Error E = Buf->commit())
-          return E;
+          return createFileError(Filename, std::move(E));
         return Error::success();
       }
     }
 
-  return createStringError(object_error::parse_failed, "section '%s' not found",
-                           SecName.str().c_str());
+  return createFileError(InputFilename, object_error::parse_failed,
+                         "section '%s' not found", SecName.str().c_str());
 }
 
 static Error addSection(const NewSectionInfo &NewSection, Object &Obj) {
@@ -426,12 +426,13 @@ static Error handleArgs(const CommonConfig &Config,
     StringRef SectionName;
     StringRef FileName;
     std::tie(SectionName, FileName) = Flag.split('=');
-    if (Error E = dumpSectionToFile(SectionName, FileName, Obj))
+    if (Error E =
+            dumpSectionToFile(SectionName, FileName, Config.InputFilename, Obj))
       return E;
   }
 
   if (Error E = removeSections(Config, Obj))
-    return E;
+    return createFileError(Config.InputFilename, std::move(E));
 
   // Mark symbols to determine which symbols are still needed.
   if (Config.StripAll)
@@ -446,20 +447,20 @@ static Error handleArgs(const CommonConfig &Config,
 
   for (const NewSectionInfo &NewSection : Config.AddSection) {
     if (Error E = isValidMachOCannonicalName(NewSection.SectionName))
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
     if (Error E = addSection(NewSection, Obj))
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
   }
 
   for (const NewSectionInfo &NewSection : Config.UpdateSection) {
     if (Error E = isValidMachOCannonicalName(NewSection.SectionName))
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
     if (Error E = updateSection(NewSection, Obj))
-      return E;
+      return createFileError(Config.InputFilename, std::move(E));
   }
 
   if (Error E = processLoadCommands(MachOConfig, Obj))
-    return E;
+    return createFileError(Config.InputFilename, std::move(E));
 
   return Error::success();
 }
@@ -479,7 +480,7 @@ Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config,
                              Config.InputFilename.str().c_str());
 
   if (Error E = handleArgs(Config, MachOConfig, **O))
-    return createFileError(Config.InputFilename, std::move(E));
+    return E;
 
   // Page size used for alignment of segment sizes in Mach-O executables and
   // dynamic libraries.
diff --git a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
index cf3d884bee3bd..57fd0f5ad233c 100644
--- a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
+++ b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
@@ -38,23 +38,23 @@ static bool isCommentSection(const Section &Sec) {
 }
 
 static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
-                               Object &Obj) {
+                               StringRef InputFilename, Object &Obj) {
   for (const Section &Sec : Obj.Sections) {
     if (Sec.Name == SecName) {
       ArrayRef<uint8_t> Contents = Sec.Contents;
       Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
           FileOutputBuffer::create(Filename, Contents.size());
       if (!BufferOrErr)
-        return BufferOrErr.takeError();
+        return createFileError(Filename, BufferOrErr.takeError());
       std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
       std::copy(Contents.begin(), Contents.end(), Buf->getBufferStart());
       if (Error E = Buf->commit())
-        return E;
+        return createFileError(Filename, std::move(E));
       return Error::success();
     }
   }
-  return createStringError(errc::invalid_argument, "section '%s' not found",
-                           SecName.str().c_str());
+  return createFileError(Filename, errc::invalid_argument,
+                         "section '%s' not found", SecName.str().c_str());
 }
 
 static void removeSections(const CommonConfig &Config, Object &Obj) {
@@ -115,8 +115,9 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) {
     StringRef SecName;
     StringRef FileName;
     std::tie(SecName, FileName) = Flag.split("=");
-    if (Error E = dumpSectionToFile(SecName, FileName, Obj))
-      return createFileError(FileName, std::move(E));
+    if (Error E =
+            dumpSectionToFile(SecName, FileName, Config.InputFilename, Obj))
+      return E;
   }
 
   removeSections(Config, Obj);
diff --git a/llvm/test/tools/llvm-objcopy/ELF/dump-section.test b/llvm/test/tools/llvm-objcopy/ELF/dump-section.test
index 037ec86090e55..2dbbcc0ca568e 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/dump-section.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/dump-section.test
@@ -64,3 +64,7 @@ ProgramHeaders:
 # RUN: not llvm-objcopy --dump-section .text= %t /dev/null 2>&1 | FileCheck %s --check-prefix=ERR2
 
 # ERR2: error: bad format for --dump-section, expected section=file
+
+# RUN: not llvm-objcopy --dump-section .text=not_exists/text-section %t 2>&1 \
+# RUN:   | FileCheck -DMSG=%errc_ENOENT %s -DINPUT=%t --check-prefix=NO-SUCH-PATH
+# NO-SUCH-PATH: error: 'not_exists/text-section': [[MSG]]
diff --git a/llvm/test/tools/llvm-objcopy/MachO/dump-section.test b/llvm/test/tools/llvm-objcopy/MachO/dump-section.test
index 9a1227cdbbda1..d54a50b557bb7 100644
--- a/llvm/test/tools/llvm-objcopy/MachO/dump-section.test
+++ b/llvm/test/tools/llvm-objcopy/MachO/dump-section.test
@@ -21,6 +21,10 @@
 # RUN:   | FileCheck %s -DINPUT=%t --check-prefix=NO-SUCH-SECTION
 # NO-SUCH-SECTION: error: '[[INPUT]]': section '__TEXT,__foo' not found
 
+# RUN: not llvm-objcopy --dump-section __TEXT,__text=not_exists/text-section %t 2>&1 \
+# RUN:   | FileCheck -DMSG=%errc_ENOENT %s -DINPUT=%t --check-prefix=NO-SUCH-PATH
+# NO-SUCH-PATH: error: 'not_exists/text-section': [[MSG]]
+
 --- !mach-o
 FileHeader:
   magic:           0xFEEDFACF
diff --git a/llvm/test/tools/llvm-objcopy/wasm/dump-section.test b/llvm/test/tools/llvm-objcopy/wasm/dump-section.test
index 983a581e03fe2..2d1533f06df10 100644
--- a/llvm/test/tools/llvm-objcopy/wasm/dump-section.test
+++ b/llvm/test/tools/llvm-objcopy/wasm/dump-section.test
@@ -28,6 +28,10 @@
 
 # REMOVED-NOT: producers
 
+# RUN: not llvm-objcopy --dump-section producers=not_exists/text-section %t 2>&1 \
+# RUN:   | FileCheck -DMSG=%errc_ENOENT %s -DINPUT=%t --check-prefix=NO-SUCH-PATH
+# NO-SUCH-PATH: error: 'not_exists/text-section': [[MSG]]
+
 --- !WASM
 FileHeader:
   Version: 0x00000001

From ed762db1e0088a0ad5c7d72e8ad2b08a5b1cf1be Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 6 Feb 2025 13:11:11 +0100
Subject: [PATCH 088/282] [LLVM][Support] Add new CreateFileError functions
 (#125906)

Add new CreateFileError functions to create a StringError with the
specified error code and prepend the file path to it

Needed for: #125345

(cherry picked from commit 2464f4ba6e0e50bb30c31b6526fa0bdd5a531217)
---
 llvm/include/llvm/Support/Error.h    | 17 +++++++++++++++++
 llvm/unittests/Support/ErrorTest.cpp | 11 +++++++++++
 2 files changed, 28 insertions(+)

diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h
index 90120156ec2ea..c1b809a09bb80 100644
--- a/llvm/include/llvm/Support/Error.h
+++ b/llvm/include/llvm/Support/Error.h
@@ -1404,6 +1404,23 @@ inline Error createFileError(const Twine &F, size_t Line, std::error_code EC) {
   return createFileError(F, Line, errorCodeToError(EC));
 }
 
+/// Create a StringError with the specified error code and prepend the file path
+/// to it.
+inline Error createFileError(const Twine &F, std::error_code EC,
+                             const Twine &S) {
+  Error E = createStringError(EC, S);
+  return createFileError(F, std::move(E));
+}
+
+/// Create a StringError with the specified error code and prepend the file path
+/// to it.
+template <typename... Ts>
+inline Error createFileError(const Twine &F, std::error_code EC,
+                             char const *Fmt, const Ts &...Vals) {
+  Error E = createStringError(EC, Fmt, Vals...);
+  return createFileError(F, std::move(E));
+}
+
 Error createFileError(const Twine &F, ErrorSuccess) = delete;
 
 /// Helper for check-and-exit error handling.
diff --git a/llvm/unittests/Support/ErrorTest.cpp b/llvm/unittests/Support/ErrorTest.cpp
index 98d19e8d2a15a..00c562ecc059d 100644
--- a/llvm/unittests/Support/ErrorTest.cpp
+++ b/llvm/unittests/Support/ErrorTest.cpp
@@ -976,6 +976,17 @@ TEST(Error, FileErrorTest) {
   handleAllErrors(std::move(FE6), [](std::unique_ptr<FileError> F) {
     EXPECT_EQ(F->messageWithoutFileInfo(), "CustomError {6}");
   });
+
+  Error FE7 =
+      createFileError("file.bin", make_error_code(std::errc::invalid_argument),
+                      "invalid argument");
+  EXPECT_EQ(toString(std::move(FE7)), "'file.bin': invalid argument");
+
+  StringRef Argument = "arg";
+  Error FE8 =
+      createFileError("file.bin", make_error_code(std::errc::invalid_argument),
+                      "invalid argument '%s'", Argument.str().c_str());
+  EXPECT_EQ(toString(std::move(FE8)), "'file.bin': invalid argument 'arg'");
 }
 
 TEST(Error, FileErrorErrorCode) {

From f0f59e3ecc17ba60ebf5c45cc0564628959466f8 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Mon, 10 Feb 2025 09:00:31 -0500
Subject: [PATCH 089/282] [clang] Expose -f(no-)strict-overflow as a clang-cl
 option (#126512)

Also move the -fno-strict-overflow option definition next to the
-fstrict-overflow one while here.

Also add test coverage for f(no-)wrapv-pointer being a clang-cl option.

(cherry picked from commit 71adb054024a1e9bd5ed4566beda74dea65362cd)
---
 clang/include/clang/Driver/Options.td | 6 +++---
 clang/test/Driver/cl-options.c        | 4 ++++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a2b47b943ef90..02e5c4cbb4bff 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3505,8 +3505,6 @@ def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group<f_Group>,
 def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
 def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
 def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
-def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>,
-  Visibility<[ClangOption, FlangOption]>;
 defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero",
   PosFlag<SetTrue, [], [FlangOption, FC1Option],
           "Zero initialize globals without default initialization (default)">,
@@ -4023,7 +4021,9 @@ defm strict_vtable_pointers : BoolFOption<"strict-vtable-pointers",
             " overwriting polymorphic C++ objects">,
   NegFlag<SetFalse>>;
 def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>,
-  Visibility<[ClangOption, FlangOption]>;
+  Visibility<[ClangOption, CLOption, FlangOption]>;
+def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>,
+  Visibility<[ClangOption, CLOption, FlangOption]>;
 def fpointer_tbaa : Flag<["-"], "fpointer-tbaa">, Group<f_Group>;
 def fdriver_only : Flag<["-"], "fdriver-only">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, CLOption, DXCOption]>,
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 29a0fcbc17ac6..9f9ca1bf1a8fd 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -738,9 +738,13 @@
 // RUN:     -fno-modules-search-all \
 // RUN:     -fimplicit-modules \
 // RUN:     -fno-implicit-modules \
+// RUN:     -fstrict-overflow \
+// RUN:     -fno-strict-overflow \
 // RUN:     -ftrivial-auto-var-init=zero \
 // RUN:     -fwrapv \
 // RUN:     -fno-wrapv \
+// RUN:     -fwrapv-pointer \
+// RUN:     -fno-wrapv-pointer \
 // RUN:     --version \
 // RUN:     -Werror /Zs -- %s 2>&1
 

From c7f18bae1a9c188706912dc3379afb1de7f8921e Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 7 Feb 2025 16:37:05 +0000
Subject: [PATCH 090/282] [AArch64] Fix op mask detection in
 performZExtDeinterleaveShuffleCombine (#126054)

Given a zext from an extract vector, with a shuffle mask like <4, 0, 0, 4> we
would previously recognize the top half as a deinterleave. In order to convert
into a uzp we should have been checking that the bottom half is also poison.

Fixes #125989

(cherry picked from commit 2c43479683651f0eb208c97bf12e49bacbea4e6f)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   3 +
 llvm/test/CodeGen/AArch64/zext-shuffle.ll     | 143 ++++++++++++++++++
 2 files changed, 146 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bd9994bcb669c..b5cca88b6b511 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22364,6 +22364,9 @@ static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
   if (!IsDeInterleave)
     IsUndefDeInterleave =
         Shuffle->getOperand(1).isUndef() &&
+        all_of(
+            Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
+            [](int M) { return M < 0; }) &&
         ShuffleVectorInst::isDeInterleaveMaskOfFactor(
             Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
                                      VT.getVectorNumElements() / 2),
diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
index 2965996ddcb02..20d2071d7fe54 100644
--- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -543,3 +543,146 @@ define <8 x double> @uitofp_load_fadd(ptr %p) {
     ret <8 x double> %c
 }
 
+define <4 x i32> @isUndefDeInterleave_b0(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: isUndefDeInterleave_b0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_b1(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_b1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #16
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 poison, i32 poison>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_b2(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_b2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_b3(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_b3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #16
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_t0(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: isUndefDeInterleave_t0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 4>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_t1(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_t1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #16
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_t2(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_t2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 2, i32 6>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_t3(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_t3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #16
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 7>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_b0_bad(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: isUndefDeInterleave_b0_bad:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI40_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 4, i32 4, i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @isUndefDeInterleave_t1_bad(<8 x i16> %a) {
+; CHECK-LABEL: isUndefDeInterleave_t1_bad:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI41_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI41_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 4, i32 1, i32 5>
+  %s2 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i16> %s2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define i16 @undeftop(<8 x i16> %0) {
+; CHECK-LABEL: undeftop:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[4]
+; CHECK-NEXT:    uaddl v0.4s, v0.4h, v0.4h
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    umov w0, v0.h[0]
+; CHECK-NEXT:    ret
+  %2 = shufflevector <8 x i16> %0, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 9, i32 7, i32 5, i32 3>
+  %3 = zext <8 x i16> %2 to <8 x i64>
+  %new0 = add <8 x i64> %3, %3
+  %last = trunc <8 x i64> %new0 to <8 x i16>
+  %4 = extractelement <8 x i16> %last, i32 0
+  ret i16 %4
+}

From 5c75c290f87aacd2227619748ccb0ffd4b578ea4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 10 Feb 2025 16:29:42 +0000
Subject: [PATCH 091/282] [LV] Forget LCSSA phi with new pred before other SCEV
 invalidation. (#119897)

`forgetLcssaPhiWithNewPredecessor` performs additional invalidation if
there is an existing SCEV for the phi, but earlier
`forgetBlockAndLoopDispositions` or `forgetLoop` may already invalidate
the SCEV for the phi.

Change the order to first call `forgetLcssaPhiWithNewPredecessor` to
ensure it runs before its SCEV gets invalidated too eagerly.

Fixes https://github.com/llvm/llvm-project/issues/119665.

PR: https://github.com/llvm/llvm-project/pull/119897
(cherry picked from commit 3706dfef660097f24fb5efbac0d7f14b424492ed)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   8 +-
 ...idate-scev-at-scope-after-vectorization.ll | 118 ++++++++++++++++++
 2 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 06c2a91f89b1c..0ceeec48487f6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2888,10 +2888,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   if (EnableVPlanNativePath)
     fixNonInductionPHIs(State);
 
-  // Forget the original basic block.
-  PSE.getSE()->forgetLoop(OrigLoop);
-  PSE.getSE()->forgetBlockAndLoopDispositions();
-
   // After vectorization, the exit blocks of the original loop will have
   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
   // looked through single-entry phis.
@@ -2901,6 +2897,10 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
     for (PHINode &PN : Exit->phis())
       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
 
+  // Forget the original basic block.
+  PSE.getSE()->forgetLoop(OrigLoop);
+  PSE.getSE()->forgetBlockAndLoopDispositions();
+
   // Don't apply optimizations below when no vector region remains, as they all
   // require a vector loop at the moment.
   if (!State.Plan->getVectorLoopRegion())
diff --git a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
new file mode 100644
index 0000000000000..235a8f0fa34a8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<scalar-evolution>,loop-vectorize' -force-vector-width=4 -scalar-evolution-classify-expressions=false -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/119665.
+
+; %loop.2's backedge-taken-count depends on %add.1 from %loop.1 via its
+; corresponding SCEV at the scope of %loop.2. After vectorizing %loop.1, %add.1
+; isn't available at the entry of %loop.2 anymore and %add.1 at %loop.2's scope
+; must be invalidated, as well as %loop.2's backedge-taken count.
+define void @test_invalidate_scevs_at_scope(ptr %p) {
+; CHECK-LABEL: define void @test_invalidate_scevs_at_scope(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[EXIT_1:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_1:.*]]
+; CHECK:       [[LOOP_1]]:
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[TMP4]], [[IV_1]]
+; CHECK-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[IV_1]], 100
+; CHECK-NEXT:    br i1 [[C_1]], label %[[EXIT_1]], label %[[LOOP_1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT_1]]:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD_1]], %[[LOOP_1]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD_LCSSA]], i32 100)
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[SMAX]], [[ADD_LCSSA]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP7]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH2:.*]], label %[[VECTOR_PH3:.*]]
+; CHECK:       [[VECTOR_PH3]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP7]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP7]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY4:.*]]
+; CHECK:       [[VECTOR_BODY4]]:
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ 0, %[[VECTOR_PH3]] ], [ [[INDEX_NEXT8:%.*]], %[[VECTOR_BODY4]] ]
+; CHECK-NEXT:    [[VEC_IND6:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH3]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VECTOR_BODY4]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    store <4 x i64> [[VEC_IND6]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT7]] = add <4 x i64> [[VEC_IND6]], splat (i64 4)
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK1]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP7]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT_2:.*]], label %[[SCALAR_PH2]]
+; CHECK:       [[SCALAR_PH2]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK1]] ], [ 0, %[[EXIT_1]] ]
+; CHECK-NEXT:    br label %[[LOOP_2:.*]]
+; CHECK:       [[LOOP_2]]:
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL9]], %[[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ]
+; CHECK-NEXT:    [[IV_2_TRUNC:%.*]] = trunc i64 [[IV_2]] to i32
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV_2]]
+; CHECK-NEXT:    store i64 [[IV_2]], ptr [[GEP]], align 4
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_LCSSA]], [[IV_2_TRUNC]]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[ADD_2]], 100
+; CHECK-NEXT:    br i1 [[C_2]], label %[[LOOP_2]], label %[[EXIT_2]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT_2]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.1
+
+loop.1:
+  %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ]
+  %1 = load i32, ptr %p, align 4
+  %add.1 = add i32 %1, %iv.1
+  %iv.1.next = add i32 %iv.1, 1
+  %c.1 = icmp eq i32 %iv.1, 100
+  br i1 %c.1, label %exit.1, label %loop.1
+
+exit.1:
+  %add.lcssa = phi i32 [ %add.1, %loop.1 ]
+  br label %loop.2
+
+loop.2:
+  %iv.2 = phi i64 [ 0, %exit.1 ], [ %iv.2.next, %loop.2 ]
+  %iv.2.trunc = trunc i64 %iv.2 to i32
+  %iv.2.next = add i64 %iv.2, 1
+  %gep = getelementptr inbounds i64, ptr %p, i64 %iv.2
+  store i64 %iv.2, ptr %gep
+  %add.2 = add i32 %add.lcssa, %iv.2.trunc
+  %c.2 = icmp slt i32 %add.2, 100
+  br i1 %c.2, label %loop.2, label %exit.2
+
+exit.2:
+  ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.

From 9e07fbf6411110b0f3e9b814a2e2bcac74c9eb88 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 10 Feb 2025 09:21:31 -0800
Subject: [PATCH 092/282] [ELF] --package-metadata: support
 %[0-9a-fA-F][0-9a-fA-F]

(This application-specific option is probably not appropriate as a
linker option (.o file offers more flexibility and decouples JSON
verification from linkers). However, the option has gained some traction
in Linux distributions, with support in GNU ld, gold, and mold.)

GNU ld has supported percent-encoded bytes and extensions like
`%[comma]` since November 2024.  mold supports just percent-encoded
bytes.  To prepare for potential adoption by Ubuntu, let's support
percent-encoded bytes.

Link: https://sourceware.org/bugzilla/show_bug.cgi?id=32003
Link: https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/2071468

Pull Request: https://github.com/llvm/llvm-project/pull/126396

(cherry picked from commit 0a470a926481d370251731cb2dd897531756335f)
---
 lld/ELF/Config.h                |  2 +-
 lld/ELF/Driver.cpp              | 23 ++++++++++++++++++++++-
 lld/ELF/Options.td              |  2 +-
 lld/docs/ld.lld.1               |  4 ++++
 lld/test/ELF/package-metadata.s | 20 +++++++++++++++-----
 5 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index c2aadb2cef520..98e52b52ea46a 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -407,7 +407,7 @@ struct Config {
   StringRef thinLTOJobs;
   unsigned timeTraceGranularity;
   int32_t splitStackAdjustSize;
-  StringRef packageMetadata;
+  SmallVector<uint8_t, 0> packageMetadata;
 
   // The following config options do not directly correspond to any
   // particular command line options.
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 9d0c992c1e851..2d8a5ade2fece 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -822,6 +822,26 @@ static ICFLevel getICF(opt::InputArgList &args) {
   return ICFLevel::All;
 }
 
+static void parsePackageMetadata(Ctx &ctx, const opt::Arg &arg) {
+  unsigned c0, c1;
+  SmallVector<uint8_t, 0> decoded;
+  StringRef s = arg.getValue();
+  for (size_t i = 0, e = s.size(); i != e; ++i) {
+    if (s[i] != '%') {
+      decoded.push_back(s[i]);
+    } else if (i + 2 < e && (c1 = hexDigitValue(s[i + 1])) != -1u &&
+               (c0 = hexDigitValue(s[i + 2])) != -1u) {
+      decoded.push_back(uint8_t(c1 * 16 + c0));
+      i += 2;
+    } else {
+      ErrAlways(ctx) << arg.getSpelling() << ": invalid % escape at byte " << i
+                     << "; supports only %[0-9a-fA-F][0-9a-fA-F]";
+      return;
+    }
+  }
+  ctx.arg.packageMetadata = std::move(decoded);
+}
+
 static StripPolicy getStrip(Ctx &ctx, opt::InputArgList &args) {
   if (args.hasArg(OPT_relocatable))
     return StripPolicy::None;
@@ -1383,7 +1403,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
   ctx.arg.optimize = args::getInteger(args, OPT_O, 1);
   ctx.arg.orphanHandling = getOrphanHandling(ctx, args);
   ctx.arg.outputFile = args.getLastArgValue(OPT_o);
-  ctx.arg.packageMetadata = args.getLastArgValue(OPT_package_metadata);
+  if (auto *arg = args.getLastArg(OPT_package_metadata))
+    parsePackageMetadata(ctx, *arg);
   ctx.arg.pie = args.hasFlag(OPT_pie, OPT_no_pie, false);
   ctx.arg.printIcfSections =
       args.hasFlag(OPT_print_icf_sections, OPT_no_print_icf_sections, false);
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index c31875305952f..d9998176d0dd4 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -565,7 +565,7 @@ def z: JoinedOrSeparate<["-"], "z">, MetaVarName<"<option>">,
 def visual_studio_diagnostics_format : FF<"vs-diagnostics">,
 HelpText<"Format diagnostics for Visual Studio compatibility">;
 
-def package_metadata: JJ<"package-metadata=">, HelpText<"Emit package metadata note">;
+def package_metadata: JJ<"package-metadata=">, HelpText<"Emit a percent-encoded string to the .note.package section">;
 
 // Aliases
 def: Separate<["-"], "dT">, Alias<default_script>, HelpText<"Alias for --default-script">;
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index b28c6082f68b0..b5c1816ce6e5f 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -493,6 +493,10 @@ If
 .Fl -use-android-relr-tags
 is specified, use SHT_ANDROID_RELR instead of SHT_RELR.
 .Pp
+.It Fl -package-metadata
+Emit a percent-encoded string to the
+.Cm .note.package
+section. For example, %25 decodes to a single %.
 .It Fl -pic-veneer
 Always generate position independent thunks.
 .It Fl -pie , Fl -pic-executable
diff --git a/lld/test/ELF/package-metadata.s b/lld/test/ELF/package-metadata.s
index 29df499d7e98d..a70a8940d7c68 100644
--- a/lld/test/ELF/package-metadata.s
+++ b/lld/test/ELF/package-metadata.s
@@ -1,12 +1,15 @@
 # REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
 
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld a.o --package-metadata='{}'
+# RUN: llvm-readelf -n a.out | FileCheck %s --check-prefixes=NOTE,FIRST
 
-# RUN: ld.lld %t.o -o %t --package-metadata='{}'
-# RUN: llvm-readelf -n %t | FileCheck %s --check-prefixes=NOTE,FIRST
+# RUN: ld.lld a.o --package-metadata='{"abc":123}'
+# RUN: llvm-readelf -n a.out | FileCheck %s --check-prefixes=NOTE,SECOND
 
-# RUN: ld.lld %t.o -o %t --package-metadata='{"abc":123}'
-# RUN: llvm-readelf -n %t | FileCheck %s --check-prefixes=NOTE,SECOND
+# RUN: ld.lld a.o --package-metadata='%7b%22abc%22:123%7D'
+# RUN: llvm-readelf -n a.out | FileCheck %s --check-prefixes=NOTE,SECOND
 
 # NOTE: .note.package
 # NOTE-NEXT: Owner
@@ -14,6 +17,13 @@
 # FIRST-NEXT: description data: 7b 7d 00
 # SECOND-NEXT: description data: 7b 22 61 62 63 22 3a 31 32 33 7d 00
 
+# RUN: not ld.lld a.o --package-metadata='%7b%' 2>&1 | FileCheck %s --check-prefix=ERR
+# RUN: not ld.lld a.o --package-metadata='%7b%7' 2>&1 | FileCheck %s --check-prefix=ERR
+# RUN: not ld.lld a.o --package-metadata='%7b%7g' 2>&1 | FileCheck %s --check-prefix=ERR
+
+# ERR: error: --package-metadata=: invalid % escape at byte 3; supports only %[0-9a-fA-F][0-9a-fA-F]
+
+#--- a.s
 .globl _start
 _start:
   ret

From 7fda02c09d3b24b84b92334d0e491dc172648b71 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 6 Feb 2025 14:11:22 +0100
Subject: [PATCH 093/282] [flang] Use clang_target_link_libraries() for clang
 dependency (#126037)

This dependency is part of libclang-cpp, so it should use
clang_target_link_libraries.

(cherry picked from commit 7c695e4906300a11208d7979c470d63b0d037bb2)
---
 flang/unittests/Frontend/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/flang/unittests/Frontend/CMakeLists.txt b/flang/unittests/Frontend/CMakeLists.txt
index 9177997f41f53..2e3e7cb596c58 100644
--- a/flang/unittests/Frontend/CMakeLists.txt
+++ b/flang/unittests/Frontend/CMakeLists.txt
@@ -12,7 +12,6 @@ add_flang_unittest(FlangFrontendTests
 
 target_link_libraries(FlangFrontendTests
   PRIVATE
-  clangBasic
   flangFrontend
   flangFrontendTool
   FortranLower
@@ -22,6 +21,11 @@ target_link_libraries(FlangFrontendTests
   FortranEvaluate
 )
 
+clang_target_link_libraries(FlangFrontendTests
+  PRIVATE
+  clangBasic
+)
+
 mlir_target_link_libraries(FlangFrontendTests
   PRIVATE
   MLIRIR

From 56b4c1168f51aafb576226efe35430fe4ec119e8 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 2 Feb 2025 19:04:23 +0800
Subject: [PATCH 094/282] [InstSimplify] Add additional checks when
 substituting pointers (#125385)

Compile-time impact:
https://llvm-compile-time-tracker.com/compare.php?from=d09b521624f263b5f1296f8d4771836b97e600cb&to=e437ba2cb83bb965e13ef00727671896f03ff84f&stat=instructions:u
IR diff looks acceptable.
Closes https://github.com/llvm/llvm-project/issues/115574

(cherry picked from commit 1af627b592dd15bbe58136f902ced46251fc344d)
---
 llvm/lib/Analysis/InstructionSimplify.cpp     | 17 ++++++----
 .../Transforms/InstSimplify/select-icmp.ll    | 32 +++++++++++++++++++
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index d69747e30f884..4e550f8fa6a9f 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/CmpInstAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstSimplifyFolder.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/OverflowInstAnalysis.h"
@@ -4737,12 +4738,16 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
   // the arms of the select. See if substituting this value into the arm and
   // simplifying the result yields the same value as the other arm.
   if (Pred == ICmpInst::ICMP_EQ) {
-    if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal,
-                                                 FalseVal, Q, MaxRecurse))
-      return V;
-    if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal,
-                                                 FalseVal, Q, MaxRecurse))
-      return V;
+    if (CmpLHS->getType()->isIntOrIntVectorTy() ||
+        canReplacePointersIfEqual(CmpLHS, CmpRHS, Q.DL))
+      if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal,
+                                                   FalseVal, Q, MaxRecurse))
+        return V;
+    if (CmpLHS->getType()->isIntOrIntVectorTy() ||
+        canReplacePointersIfEqual(CmpRHS, CmpLHS, Q.DL))
+      if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal,
+                                                   FalseVal, Q, MaxRecurse))
+        return V;
 
     Value *X;
     Value *Y;
diff --git a/llvm/test/Transforms/InstSimplify/select-icmp.ll b/llvm/test/Transforms/InstSimplify/select-icmp.ll
index a6ef937760a58..64c0d1d7553fe 100755
--- a/llvm/test/Transforms/InstSimplify/select-icmp.ll
+++ b/llvm/test/Transforms/InstSimplify/select-icmp.ll
@@ -244,3 +244,35 @@ cond.true:                                        ; preds = %entry
 cond.end:                                         ; preds = %entry, %cond.true
   ret i8 0
 }
+
+define ptr @icmp_ptr_eq_replace(ptr %a, ptr %b) {
+; CHECK-LABEL: @icmp_ptr_eq_replace(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[A:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = select i1 [[CMP]], ptr [[A]], ptr [[B1]]
+; CHECK-NEXT:    ret ptr [[B]]
+;
+  %cmp = icmp eq ptr %a, %b
+  %sel = select i1 %cmp, ptr %a, ptr %b
+  ret ptr %sel
+}
+
+define ptr @icmp_ptr_eq_replace_null(ptr %a) {
+; CHECK-LABEL: @icmp_ptr_eq_replace_null(
+; CHECK-NEXT:    ret ptr [[A:%.*]]
+;
+  %cmp = icmp eq ptr %a, null
+  %sel = select i1 %cmp, ptr null, ptr %a
+  ret ptr %sel
+}
+
+define ptr @ptr_eq_replace_same_underlying_object(ptr %st, i64 %i, i64 %j) {
+; CHECK-LABEL: @ptr_eq_replace_same_underlying_object(
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i8, ptr [[ST:%.*]], i64 [[J:%.*]]
+; CHECK-NEXT:    ret ptr [[B]]
+;
+  %a = getelementptr inbounds i8, ptr %st, i64 %i
+  %b = getelementptr inbounds i8, ptr %st, i64 %j
+  %cmp = icmp eq ptr %a, %b
+  %sel = select i1 %cmp, ptr %a, ptr %b
+  ret ptr %sel
+}

From 2fb9effee42ebb3fe29a6635f600f51efaff248f Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Mon, 3 Feb 2025 22:43:43 -0500
Subject: [PATCH 095/282] [benchmark] Get number of CPUs with sysconf() on
 Linux (#125603)

(cherry picked from commit c24774dc4f4402c3ad150363321cc972ed2669e7)
(cherry picked from commit fbe470c1b215e3f953a41db6b91d20ce0bcf5c4e)
---
 third-party/benchmark/src/sysinfo.cc | 53 ++--------------------------
 1 file changed, 3 insertions(+), 50 deletions(-)

diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
index 2bed1663af2e9..8283a081ee80b 100644
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -495,14 +495,14 @@ int GetNumCPUsImpl() {
   return sysinfo.dwNumberOfProcessors;  // number of logical
                                         // processors in the current
                                         // group
-#elif defined(BENCHMARK_OS_SOLARIS)
+#elif defined(__linux__) || defined(BENCHMARK_OS_SOLARIS)
   // Returns -1 in case of a failure.
-  long num_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+  int num_cpu = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
   if (num_cpu < 0) {
     PrintErrorAndDie("sysconf(_SC_NPROCESSORS_ONLN) failed with error: ",
                      strerror(errno));
   }
-  return (int)num_cpu;
+  return num_cpu;
 #elif defined(BENCHMARK_OS_QNX)
   return static_cast<int>(_syspage_ptr->num_cpu);
 #elif defined(BENCHMARK_OS_QURT)
@@ -511,53 +511,6 @@ int GetNumCPUsImpl() {
     hardware_threads.max_hthreads = 1;
   }
   return hardware_threads.max_hthreads;
-#else
-  int num_cpus = 0;
-  int max_id = -1;
-  std::ifstream f("/proc/cpuinfo");
-  if (!f.is_open()) {
-    PrintErrorAndDie("Failed to open /proc/cpuinfo");
-  }
-#if defined(__alpha__)
-  const std::string Key = "cpus detected";
-#else
-  const std::string Key = "processor";
-#endif
-  std::string ln;
-  while (std::getline(f, ln)) {
-    if (ln.empty()) continue;
-    std::size_t split_idx = ln.find(':');
-    std::string value;
-#if defined(__s390__)
-    // s390 has another format in /proc/cpuinfo
-    // it needs to be parsed differently
-    if (split_idx != std::string::npos)
-      value = ln.substr(Key.size() + 1, split_idx - Key.size() - 1);
-#else
-    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
-#endif
-    if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
-      num_cpus++;
-      if (!value.empty()) {
-        const int cur_id = benchmark::stoi(value);
-        max_id = std::max(cur_id, max_id);
-      }
-    }
-  }
-  if (f.bad()) {
-    PrintErrorAndDie("Failure reading /proc/cpuinfo");
-  }
-  if (!f.eof()) {
-    PrintErrorAndDie("Failed to read to end of /proc/cpuinfo");
-  }
-  f.close();
-
-  if ((max_id + 1) != num_cpus) {
-    fprintf(stderr,
-            "CPU ID assignments in /proc/cpuinfo seem messed up."
-            " This is usually caused by a bad BIOS.\n");
-  }
-  return num_cpus;
 #endif
   BENCHMARK_UNREACHABLE();
 }

From 19662e3a4f5d9114d9293cebca9bc830d3baf7db Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 5 Feb 2025 08:57:22 -0500
Subject: [PATCH 096/282] [libc++] Also provide an alignment assumption for
 vector in C++03 mode (#124839)

There's no reason not to, and it's easy enough to do using enable_if. As
a drive-by change, also add a missing _LIBCPP_NO_CFI attribute on
__add_alignment_assumption.

(cherry picked from commit ccb08b9dab7d829f8d9703d8b46b98e2d6717d0e)
---
 libcxx/include/__vector/vector.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 66cb622e20963..bad676a56a8e6 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -783,14 +783,18 @@ class _LIBCPP_TEMPLATE_VIS vector {
 
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {}
 
-  static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer __add_alignment_assumption(pointer __p) _NOEXCEPT {
-#ifndef _LIBCPP_CXX03_LANG
-    if constexpr (is_pointer<pointer>::value) {
-      if (!__libcpp_is_constant_evaluated()) {
-        return static_cast<pointer>(__builtin_assume_aligned(__p, alignof(decltype(*__p))));
-      }
+  template <class _Ptr = pointer, __enable_if_t<is_pointer<_Ptr>::value, int> = 0>
+  static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pointer
+  __add_alignment_assumption(_Ptr __p) _NOEXCEPT {
+    if (!__libcpp_is_constant_evaluated()) {
+      return static_cast<pointer>(__builtin_assume_aligned(__p, _LIBCPP_ALIGNOF(decltype(*__p))));
     }
-#endif
+    return __p;
+  }
+
+  template <class _Ptr = pointer, __enable_if_t<!is_pointer<_Ptr>::value, int> = 0>
+  static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pointer
+  __add_alignment_assumption(_Ptr __p) _NOEXCEPT {
     return __p;
   }
 };

From 758ac2a1b7cda77e1db5a7a460c120098830a8a0 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Wed, 5 Feb 2025 07:13:33 -0500
Subject: [PATCH 097/282] [CG][RISCV]Fix shuffling of odd number of input
 vectors

If the input contains odd number of shuffled vectors, the 2 last
shuffles are shuffled with the same first vector. Need to correctly
process such situation: when the first vector is requested for the first
time - extract it from the source vector, when it is requested the
second time - reuse previous result. The second vector should be
extracted in both cases.

Fixes #125269

Reviewers: topperc, preames

Reviewed By: preames

Pull Request: https://github.com/llvm/llvm-project/pull/125693

(cherry picked from commit 23b6a05ec9c2220844748487612761d1e09166b7)
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 24 +++++++++-------
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   | 28 +++++++++++++++++++
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 585af30f8c8a1..d4eaf06f857be 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5215,17 +5215,21 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
     SmallDenseMap<unsigned, SDValue, 4> Values;
     for (unsigned I : seq<unsigned>(Data.size())) {
       const auto &[Idx1, Idx2, _] = Data[I];
-      if (Values.contains(Idx1)) {
-        assert(Idx2 != UINT_MAX && Values.contains(Idx2) &&
-               "Expected both indices to be extracted already.");
-        break;
+      // If the shuffle contains permutation of odd number of elements,
+      // Idx1 might be used already in the first iteration.
+      //
+      // Idx1 = shuffle Idx1, Idx2
+      // Idx1 = shuffle Idx1, Idx3
+      SDValue &V = Values.try_emplace(Idx1).first->getSecond();
+      if (!V)
+        V = ExtractValue(Idx1 >= NumOfSrcRegs ? V2 : V1,
+                         (Idx1 % NumOfSrcRegs) * NumOpElts);
+      if (Idx2 != UINT_MAX) {
+        SDValue &V = Values.try_emplace(Idx2).first->getSecond();
+        if (!V)
+          V = ExtractValue(Idx2 >= NumOfSrcRegs ? V2 : V1,
+                           (Idx2 % NumOfSrcRegs) * NumOpElts);
       }
-      SDValue V = ExtractValue(Idx1 >= NumOfSrcRegs ? V2 : V1,
-                               (Idx1 % NumOfSrcRegs) * NumOpElts);
-      Values[Idx1] = V;
-      if (Idx2 != UINT_MAX)
-        Values[Idx2] = ExtractValue(Idx2 >= NumOfSrcRegs ? V2 : V1,
-                                    (Idx2 % NumOfSrcRegs) * NumOpElts);
     }
     SDValue V;
     for (const auto &[Idx1, Idx2, Mask] : Data) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index afd560fd74d16..c0c17d4e0623e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -431,3 +431,31 @@ define void @shuffle_i256_ldst(ptr %p) vscale_range(2,2) {
   store <4 x i256> %res, ptr %p
   ret void
 }
+
+define void @shuffle_3_input_vectors() vscale_range(4,4) {
+; CHECK-LABEL: shuffle_3_input_vectors:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 1
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
+; CHECK-NEXT:    vslidedown.vi v20, v8, 1, v0.t
+; CHECK-NEXT:    vslideup.vi v20, v9, 3
+; CHECK-NEXT:    vslidedown.vi v21, v9, 1
+; CHECK-NEXT:    vmv1r.v v22, v8
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vmsgt.vi v8, v16, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    sb a0, 0(zero)
+; CHECK-NEXT:    ret
+  %1 = shufflevector <32 x i64> zeroinitializer, <32 x i64> splat (i64 1), <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 poison, i32 poison, i32 33, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = icmp slt <32 x i64> zeroinitializer, %1
+  %3 = bitcast <32 x i1> %2 to i32
+  %4 = trunc i32 %3 to i8
+  store i8 %4, ptr null, align 1
+  ret void
+}

From 1951944a9e4e759688018f10bef2f6796b064a73 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Mon, 3 Feb 2025 12:04:46 +0000
Subject: [PATCH 098/282] [AArch64][SME] Spill p-regs as z-regs when streaming
 hazards are possible (#123752)

This patch adds a new option `-aarch64-enable-zpr-predicate-spills`
(which is disabled by default), this option replaces predicate spills
with vector spills in streaming[-compatible] functions.

For example:

```
str	p8, [sp, #7, mul vl]            // 2-byte Folded Spill
// ...
ldr	p8, [sp, #7, mul vl]            // 2-byte Folded Reload
```

Becomes:

```
mov	z0.b, p8/z, #1
str	z0, [sp]                        // 16-byte Folded Spill
// ...
ldr	z0, [sp]                        // 16-byte Folded Reload
ptrue	p4.b
cmpne	p8.b, p4/z, z0.b, #0
```

This is done to avoid streaming memory hazards between FPR/vector and
predicate spills, which currently occupy the same stack area even when
the `-aarch64-stack-hazard-size` flag is set.

This is implemented with two new pseudos SPILL_PPR_TO_ZPR_SLOT_PSEUDO
and FILL_PPR_FROM_ZPR_SLOT_PSEUDO. The expansion of these pseudos
handles scavenging the required registers (z0 in the above example) and,
in the worst case spilling a register to an emergency stack slot in the
expansion. The condition flags are also preserved around the `cmpne` in
case they are live at the expansion point.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  313 ++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   16 +-
 .../Target/AArch64/AArch64RegisterInfo.cpp    |    4 +-
 llvm/lib/Target/AArch64/AArch64RegisterInfo.h |    2 +-
 .../lib/Target/AArch64/AArch64RegisterInfo.td |   11 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |   19 +
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |    2 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |   14 +
 .../AArch64/spill-fill-zpr-predicates.mir     | 1070 +++++++++++++++++
 .../AArch64/ssve-stack-hazard-remarks.ll      |   13 +-
 llvm/utils/TableGen/SubtargetEmitter.cpp      |   22 +-
 11 files changed, 1474 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a082a1ebe95bf..81523adeefcee 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1634,6 +1634,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   case AArch64::STR_PXI:
   case AArch64::LDR_ZXI:
   case AArch64::LDR_PXI:
+  case AArch64::PTRUE_B:
+  case AArch64::CPY_ZPzI_B:
+  case AArch64::CMPNE_PPzZI_B:
     return I->getFlag(MachineInstr::FrameSetup) ||
            I->getFlag(MachineInstr::FrameDestroy);
   }
@@ -3265,7 +3268,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
       break;
     case RegPairInfo::PPR:
-      StrOpc = AArch64::STR_PXI;
+      StrOpc =
+          Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
       break;
     case RegPairInfo::VG:
       StrOpc = AArch64::STRXui;
@@ -3494,7 +3498,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
       break;
     case RegPairInfo::PPR:
-      LdrOpc = AArch64::LDR_PXI;
+      LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+                          : AArch64::LDR_PXI;
       break;
     case RegPairInfo::VG:
       continue;
@@ -3720,6 +3725,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       continue;
     }
 
+    // Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is
+    // spilled. If all of p0-p3 are used as return values p4 is must be free
+    // to reload p8-p15.
+    if (RegInfo->getSpillSize(AArch64::PPRRegClass) == 16 &&
+        AArch64::PPR_p8to15RegClass.contains(Reg)) {
+      SavedRegs.set(AArch64::P4);
+    }
+
     // MachO's compact unwind format relies on all registers being stored in
     // pairs.
     // FIXME: the usual format is actually better if unwinding isn't needed.
@@ -4159,8 +4172,295 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
                                         true);
 }
 
+/// Attempts to scavenge a register from \p ScavengeableRegs given the used
+/// registers in \p UsedRegs.
+static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
+                                    BitVector const &ScavengeableRegs) {
+  for (auto Reg : ScavengeableRegs.set_bits()) {
+    if (UsedRegs.available(Reg))
+      return Reg;
+  }
+  return AArch64::NoRegister;
+}
+
+/// Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
+/// \p MachineInstrs.
+static void propagateFrameFlags(MachineInstr &SourceMI,
+                                ArrayRef<MachineInstr *> MachineInstrs) {
+  for (MachineInstr *MI : MachineInstrs) {
+    if (SourceMI.getFlag(MachineInstr::FrameSetup))
+      MI->setFlag(MachineInstr::FrameSetup);
+    if (SourceMI.getFlag(MachineInstr::FrameDestroy))
+      MI->setFlag(MachineInstr::FrameDestroy);
+  }
+}
+
+/// RAII helper class for scavenging or spilling a register. On construction
+/// attempts to find a free register of class \p RC (given \p UsedRegs and \p
+/// AllocatableRegs), if no register can be found spills \p SpillCandidate to \p
+/// MaybeSpillFI to free a register. The free'd register is returned via the \p
+/// FreeReg output parameter. On destruction, if there is a spill, its previous
+/// value is reloaded. The spilling and scavenging is only valid at the
+/// insertion point \p MBBI, this class should _not_ be used in places that
+/// create or manipulate basic blocks, moving the expected insertion point.
+struct ScopedScavengeOrSpill {
+  ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
+  ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;
+
+  ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        Register SpillCandidate, const TargetRegisterClass &RC,
+                        LiveRegUnits const &UsedRegs,
+                        BitVector const &AllocatableRegs,
+                        std::optional<int> *MaybeSpillFI)
+      : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
+                                          *MF.getSubtarget().getInstrInfo())),
+        TRI(*MF.getSubtarget().getRegisterInfo()) {
+    FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs);
+    if (FreeReg != AArch64::NoRegister)
+      return;
+    assert(MaybeSpillFI && "Expected emergency spill slot FI information "
+                           "(attempted to spill in prologue/epilogue?)");
+    if (!MaybeSpillFI->has_value()) {
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
+                                                 TRI.getSpillAlign(RC));
+    }
+    FreeReg = SpillCandidate;
+    SpillFI = MaybeSpillFI->value();
+    TII.storeRegToStackSlot(MBB, MBBI, FreeReg, false, *SpillFI, &RC, &TRI,
+                            Register());
+  }
+
+  bool hasSpilled() const { return SpillFI.has_value(); }
+
+  /// Returns the free register (found from scavenging or spilling a register).
+  Register freeRegister() const { return FreeReg; }
+
+  Register operator*() const { return freeRegister(); }
+
+  ~ScopedScavengeOrSpill() {
+    if (hasSpilled())
+      TII.loadRegFromStackSlot(MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI,
+                               Register());
+  }
+
+private:
+  MachineBasicBlock &MBB;
+  MachineBasicBlock::iterator MBBI;
+  const TargetRegisterClass &RC;
+  const AArch64InstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  Register FreeReg = AArch64::NoRegister;
+  std::optional<int> SpillFI;
+};
+
+/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
+/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+struct EmergencyStackSlots {
+  std::optional<int> ZPRSpillFI;
+  std::optional<int> PPRSpillFI;
+  std::optional<int> GPRSpillFI;
+};
+
+/// Registers available for scavenging (ZPR, PPR3b, GPR).
+struct ScavengeableRegs {
+  BitVector ZPRRegs;
+  BitVector PPR3bRegs;
+  BitVector GPRRegs;
+};
+
+static bool isInPrologueOrEpilogue(const MachineInstr &MI) {
+  return MI.getFlag(MachineInstr::FrameSetup) ||
+         MI.getFlag(MachineInstr::FrameDestroy);
+}
+
+/// Expands:
+/// ```
+/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = CPY_ZPzI_B $p0, 1, 0
+/// STR_ZXI $z0, $stack.0, 0
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate (
+/// spilling if necessary).
+static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
+                                          MachineInstr &MI,
+                                          const TargetRegisterInfo &TRI,
+                                          LiveRegUnits const &UsedRegs,
+                                          ScavengeableRegs const &SR,
+                                          EmergencyStackSlots &SpillSlots) {
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  ScopedScavengeOrSpill ZPredReg(
+      MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
+
+  SmallVector<MachineInstr *, 2> MachineInstrs;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B))
+                              .addReg(*ZPredReg, RegState::Define)
+                              .add(MI.getOperand(0))
+                              .addImm(1)
+                              .addImm(0)
+                              .getInstr());
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI))
+                              .addReg(*ZPredReg)
+                              .add(MI.getOperand(1))
+                              .addImm(MI.getOperand(2).getImm())
+                              .setMemRefs(MI.memoperands())
+                              .getInstr());
+  propagateFrameFlags(MI, MachineInstrs);
+}
+
+/// Expands:
+/// ```
+/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = LDR_ZXI %stack.0, 0
+/// $p0 = PTRUE_B 31, implicit $vg
+/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate (
+/// spilling if necessary). If the status flags are in use at the point of
+/// expansion they are preserved (by moving them to/from a GPR). This may cause
+/// an additional spill if no GPR is free at the expansion point.
+static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
+                                           MachineInstr &MI,
+                                           const TargetRegisterInfo &TRI,
+                                           LiveRegUnits const &UsedRegs,
+                                           ScavengeableRegs const &SR,
+                                           EmergencyStackSlots &SpillSlots) {
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  ScopedScavengeOrSpill ZPredReg(
+      MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
+
+  ScopedScavengeOrSpill PredReg(
+      MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI);
+
+  // Elide NZCV spills if we know it is not used.
+  bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
+  std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
+  if (IsNZCVUsed)
+    NZCVSaveReg.emplace(
+        MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs,
+        isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
+  SmallVector<MachineInstr *, 4> MachineInstrs;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
+                              .addReg(*ZPredReg, RegState::Define)
+                              .add(MI.getOperand(1))
+                              .addImm(MI.getOperand(2).getImm())
+                              .setMemRefs(MI.memoperands())
+                              .getInstr());
+  if (IsNZCVUsed)
+    MachineInstrs.push_back(
+        BuildMI(MBB, MI, DL, TII->get(AArch64::MRS))
+            .addReg(NZCVSaveReg->freeRegister(), RegState::Define)
+            .addImm(AArch64SysReg::NZCV)
+            .addReg(AArch64::NZCV, RegState::Implicit)
+            .getInstr());
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
+                              .addReg(*PredReg, RegState::Define)
+                              .addImm(31));
+  MachineInstrs.push_back(
+      BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
+          .addReg(MI.getOperand(0).getReg(), RegState::Define)
+          .addReg(*PredReg)
+          .addReg(*ZPredReg)
+          .addImm(0)
+          .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+          .getInstr());
+  if (IsNZCVUsed)
+    MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR))
+                                .addImm(AArch64SysReg::NZCV)
+                                .addReg(NZCVSaveReg->freeRegister())
+                                .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+                                .getInstr());
+
+  propagateFrameFlags(MI, MachineInstrs);
+  return PredReg.hasSpilled();
+}
+
+/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
+/// operations within the MachineBasicBlock \p MBB.
+static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
+                                          const TargetRegisterInfo &TRI,
+                                          ScavengeableRegs const &SR,
+                                          EmergencyStackSlots &SpillSlots) {
+  LiveRegUnits UsedRegs(TRI);
+  UsedRegs.addLiveOuts(MBB);
+  bool HasPPRSpills = false;
+  for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+    UsedRegs.stepBackward(MI);
+    switch (MI.getOpcode()) {
+    case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
+      HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR,
+                                                     SpillSlots);
+      MI.eraseFromParent();
+      break;
+    case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+      expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
+      MI.eraseFromParent();
+      break;
+    default:
+      break;
+    }
+  }
+
+  return HasPPRSpills;
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  const TargetSubtargetInfo &TSI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
+
+  // If predicates spills are 16-bytes we may need to expand
+  // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+  if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) {
+    auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
+      BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
+      assert(Regs.count() > 0 && "Expected scavengeable registers");
+      return Regs;
+    };
+
+    ScavengeableRegs SR{};
+    SR.ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
+    // Only p0-7 are possible as the second operand of cmpne (needed for fills).
+    SR.PPR3bRegs = ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
+    SR.GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
+
+    EmergencyStackSlots SpillSlots;
+    for (MachineBasicBlock &MBB : MF) {
+      // In the case we had to spill a predicate (in the range p0-p7) to reload
+      // a predicate (>= p8), additional spill/fill pseudos will be created.
+      // These need an additional expansion pass. Note: There will only be at
+      // most two expansion passes, as spilling/filling a predicate in the range
+      // p0-p7 never requires spilling another predicate.
+      for (int Pass = 0; Pass < 2; Pass++) {
+        bool HasPPRSpills =
+            expandSMEPPRToZPRSpillPseudos(MBB, TRI, SR, SpillSlots);
+        assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
+        if (!HasPPRSpills)
+          break;
+      }
+    }
+  }
+
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
@@ -4170,7 +4470,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   int64_t SVEStackSize =
       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
 
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
 
@@ -5204,9 +5503,13 @@ void AArch64FrameLowering::emitRemarks(
 
           unsigned RegTy = StackAccess::AccessType::GPR;
           if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
-            if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
+            // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+            // spill/fill the predicate as a data vector (so are an FPR acess).
+            if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
+                MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
+                AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
               RegTy = StackAccess::PPR;
-            else
+            } else
               RegTy = StackAccess::FPR;
           } else if (AArch64InstrInfo::isFpOrNEON(MI)) {
             RegTy = StackAccess::FPR;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 17dd8a073eff0..0f2b969fba35c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -81,7 +81,7 @@ static cl::opt<unsigned>
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                           AArch64::CATCHRET),
-      RI(STI.getTargetTriple()), Subtarget(STI) {}
+      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
 
 /// GetInstSize - Return the number of bytes of code the specified
 /// instruction may be.  This returns the maximum number of bytes.
@@ -2438,6 +2438,8 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::STZ2Gi:
   case AArch64::STZGi:
   case AArch64::TAGPstack:
+  case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+  case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
     return 2;
   case AArch64::LD1B_D_IMM:
   case AArch64::LD1B_H_IMM:
@@ -4223,6 +4225,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     MinOffset = -256;
     MaxOffset = 254;
     break;
+  case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+  case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
   case AArch64::LDR_ZXI:
   case AArch64::STR_ZXI:
     Scale = TypeSize::getScalable(16);
@@ -5355,6 +5359,11 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
              "Unexpected register store without SVE store instructions");
       Opc = AArch64::STR_ZXI;
       StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected predicate store without SVE store instructions");
+      Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO;
+      StackID = TargetStackID::ScalableVector;
     }
     break;
   case 24:
@@ -5527,6 +5536,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
              "Unexpected register load without SVE load instructions");
       Opc = AArch64::LDR_ZXI;
       StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected predicate load without SVE load instructions");
+      Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO;
+      StackID = TargetStackID::ScalableVector;
     }
     break;
   case 24:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 5973b63b5a802..e9730348ba58e 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -38,8 +38,8 @@ using namespace llvm;
 #define GET_REGINFO_TARGET_DESC
 #include "AArch64GenRegisterInfo.inc"
 
-AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
-    : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
+AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT, unsigned HwMode)
+    : AArch64GenRegisterInfo(AArch64::LR, 0, 0, 0, HwMode), TT(TT) {
   AArch64_MC::initLLVMToCVRegMapping(this);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 11da624af4881..898a509f75908 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -27,7 +27,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
   const Triple &TT;
 
 public:
-  AArch64RegisterInfo(const Triple &TT);
+  AArch64RegisterInfo(const Triple &TT, unsigned HwMode);
 
   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const {
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index dd4f2549929f8..fed9b7b173e9c 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -979,10 +979,19 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
 //******************************************************************************
 
 // SVE predicate register classes.
+
+// Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet()
+// (without the use of the table-gen'd predicates).
+def SMEWithZPRPredicateSpills : HwMode<"", [Predicate<"false">]>;
+
+def PPRSpillFillRI : RegInfoByHwMode<
+      [DefaultMode,              SMEWithZPRPredicateSpills],
+      [RegInfo<16,16,16>,        RegInfo<16,128,128>]>;
+
 class PPRClass<int firstreg, int lastreg, int step = 1> : RegisterClass<"AArch64",
                                   [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16,
                                   (sequence "P%u", firstreg, lastreg, step)> {
-  let Size = 16;
+  let RegInfos = PPRSpillFillRI;
 }
 
 def PPR    : PPRClass<0, 15> {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 809e658e65380..bb36af8fce5cc 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -86,6 +86,11 @@ static cl::alias AArch64StreamingStackHazardSize(
     cl::desc("alias for -aarch64-streaming-hazard-size"),
     cl::aliasopt(AArch64StreamingHazardSize));
 
+static cl::opt<bool> EnableZPRPredicateSpills(
+    "aarch64-enable-zpr-predicate-spills", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Enables spilling/reloading SVE predicates as data vectors (ZPRs)"));
+
 // Subreg liveness tracking is disabled by default for now until all issues
 // are ironed out. This option allows the feature to be used in tests.
 static cl::opt<bool>
@@ -405,6 +410,20 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
   EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
 }
 
+unsigned AArch64Subtarget::getHwModeSet() const {
+  AArch64HwModeBits Modes = AArch64HwModeBits::DefaultMode;
+
+  // Use a special hardware mode in streaming[-compatible] functions with
+  // aarch64-enable-zpr-predicate-spills. This changes the spill size (and
+  // alignment) for the predicate register class.
+  if (EnableZPRPredicateSpills.getValue() &&
+      (isStreaming() || isStreamingCompatible())) {
+    Modes |= AArch64HwModeBits::SMEWithZPRPredicateSpills;
+  }
+
+  return to_underlying(Modes);
+}
+
 const CallLowering *AArch64Subtarget::getCallLowering() const {
   return CallLoweringInfo.get();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index dca5f5393fe47..c6eb77e3bc3ba 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -130,6 +130,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
                    bool IsStreaming = false, bool IsStreamingCompatible = false,
                    bool HasMinSize = false);
 
+  virtual unsigned getHwModeSet() const override;
+
 // Getters for SubtargetFeatures defined in tablegen
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
   bool GETTER() const { return ATTRIBUTE; }
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index a01d59d0e5c43..0ac131e48c4f8 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -59,6 +59,20 @@ def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
   let hasPostISelHook = 1;
 }
 
+def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
+  Pseudo<(outs), (ins PPRorPNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]>
+{
+  let mayStore = 1;
+  let hasSideEffects = 0;
+}
+
+def FILL_PPR_FROM_ZPR_SLOT_PSEUDO :
+  Pseudo<(outs PPRorPNRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]>
+{
+  let mayLoad = 1;
+  let hasSideEffects = 0;
+}
+
 def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
 def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
                              [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
new file mode 100644
index 0000000000000..b58f91ac68a93
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -0,0 +1,1070 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-zpr-predicate-spills -run-pass=greedy %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-zpr-predicate-spills -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
+--- |
+  source_filename = "<stdin>"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--linux-gnu"
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__spill_zpr() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill_above_p7() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill_p4_saved() #0 { entry: unreachable }
+
+  attributes #0 = {nounwind "target-features"="+sme,+sve" "aarch64_pstate_sm_compatible"}
+...
+---
+name: zpr_predicate_spill
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+body:             |
+  bb.0.entry:
+    liveins: $p0
+
+    ; CHECK-LABEL: name: zpr_predicate_spill
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill
+    ; EXPAND: liveins: $p0, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    RET_ReallyLR implicit $p0
+...
+---
+name: zpr_predicate_spill__save_restore_nzcv
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+body:             |
+  bb.0.entry:
+    liveins: $p0
+
+    ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $nzcv
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv
+    ; EXPAND: liveins: $p0, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+    ;
+    ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $fp = MRS 55824, implicit-def $nzcv, implicit $nzcv
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: MSR 55824, $fp, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $nzcv
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0
+    $nzcv = IMPLICIT_DEF
+
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    FAKE_USE implicit $nzcv
+
+    RET_ReallyLR implicit $p0
+...
+---
+name: zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$x0' }
+  - { reg: '$x1' }
+  - { reg: '$x2' }
+  - { reg: '$x3' }
+  - { reg: '$x4' }
+  - { reg: '$x5' }
+  - { reg: '$x6' }
+  - { reg: '$x7' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+
+    ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $x8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x15 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x16 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x17 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x18 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr
+    ; EXPAND: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+    ;
+    ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $x8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x15 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x16 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x17 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x18 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: $fp = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $fp, 0 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $fp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $fp = MRS 55824, implicit-def $nzcv, implicit $nzcv
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: MSR 55824, $fp, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    $nzcv = IMPLICIT_DEF
+    $x8 = IMPLICIT_DEF
+    $x9 = IMPLICIT_DEF
+    $x10 = IMPLICIT_DEF
+    $x11 = IMPLICIT_DEF
+    $x12 = IMPLICIT_DEF
+    $x13 = IMPLICIT_DEF
+    $x14 = IMPLICIT_DEF
+    $x15 = IMPLICIT_DEF
+    $x16 = IMPLICIT_DEF
+    $x17 = IMPLICIT_DEF
+    $x18 = IMPLICIT_DEF
+
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+
+    RET_ReallyLR implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+...
+---
+name: zpr_predicate_spill__spill_zpr
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$z0' }
+  - { reg: '$z1' }
+  - { reg: '$z2' }
+  - { reg: '$z3' }
+  - { reg: '$z4' }
+  - { reg: '$z5' }
+  - { reg: '$z6' }
+  - { reg: '$z7' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+
+    ; CHECK-LABEL: name: zpr_predicate_spill__spill_zpr
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $z16 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z17 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z18 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z19 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z20 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z21 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z22 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z23 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z24 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z25 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z26 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z27 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z28 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z29 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z30 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z31 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill__spill_zpr
+    ; EXPAND: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4, $z23, $z22, $z21, $z20, $z19, $z18, $z17, $z16
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.22)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -20, implicit $vg
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 0 :: (store (s128) into %stack.21)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 1 :: (store (s128) into %stack.20)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 2 :: (store (s128) into %stack.19)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 3 :: (store (s128) into %stack.18)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 4 :: (store (s128) into %stack.17)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 5 :: (store (s128) into %stack.16)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 6 :: (store (s128) into %stack.15)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 7 :: (store (s128) into %stack.14)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 8 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 9 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 10 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 11 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z23, $sp, 12 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z22, $sp, 13 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z21, $sp, 14 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z20, $sp, 15 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z19, $sp, 16 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z18, $sp, 17 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z17, $sp, 18 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z16, $sp, 19 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+    ;
+    ; EXPAND-NEXT: $z16 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z17 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z18 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z19 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z20 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z21 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z22 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z23 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z24 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z25 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z26 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z27 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z28 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z29 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z30 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z31 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.24)
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 1 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $x8, 0 :: (load (s128) from %stack.24)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.24)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $x8, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.24)
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+    ; EXPAND-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 12 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 13 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z21 = frame-destroy LDR_ZXI $sp, 14 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z20 = frame-destroy LDR_ZXI $sp, 15 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z19 = frame-destroy LDR_ZXI $sp, 16 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z18 = frame-destroy LDR_ZXI $sp, 17 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z17 = frame-destroy LDR_ZXI $sp, 18 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z16 = frame-destroy LDR_ZXI $sp, 19 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.21)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.20)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.17)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.16)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.15)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.14)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 20, implicit $vg
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.22)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
+    $z16 = IMPLICIT_DEF
+    $z17 = IMPLICIT_DEF
+    $z18 = IMPLICIT_DEF
+    $z19 = IMPLICIT_DEF
+    $z20 = IMPLICIT_DEF
+    $z21 = IMPLICIT_DEF
+    $z22 = IMPLICIT_DEF
+    $z23 = IMPLICIT_DEF
+    $z24 = IMPLICIT_DEF
+    $z25 = IMPLICIT_DEF
+    $z26 = IMPLICIT_DEF
+    $z27 = IMPLICIT_DEF
+    $z28 = IMPLICIT_DEF
+    $z29 = IMPLICIT_DEF
+    $z30 = IMPLICIT_DEF
+    $z31 = IMPLICIT_DEF
+
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+
+    RET_ReallyLR implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
+...
+---
+name: zpr_predicate_spill_above_p7
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$p1' }
+  - { reg: '$p2' }
+  - { reg: '$p3' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $p1, $p2, $p3
+
+    ; CHECK-LABEL: name: zpr_predicate_spill_above_p7
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0, $p1, $p2, $p3
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p15, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p15 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill_above_p7
+    ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+    ;
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p15, 1, 0
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 1 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.16)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $x8, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.16)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
+    $p15 = IMPLICIT_DEF
+    %1:ppr = COPY $p15
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p15 = COPY %1
+
+    FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+
+    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+...
+---
+name: zpr_predicate_spill_p4_saved
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$p1' }
+  - { reg: '$p2' }
+  - { reg: '$p3' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $p1, $p2, $p3
+
+    ; CHECK-LABEL: name: zpr_predicate_spill_p4_saved
+    ; CHECK: liveins: $p0, $p1, $p2, $p3
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
+    ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+    ;
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
+    ; If we spill a register above p8, p4 must also be saved, so we can guarantee
+    ; they will be a register (in the range p0-p7 to for the cmpne reload).
+    $p8 = IMPLICIT_DEF
+
+    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+...
diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
index 0b6bf3892a0c2..c67d91952c618 100644
--- a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-PADDING
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-enable-zpr-predicate-spills -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-ZPR-PRED-SPILLS
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-enable-zpr-predicate-spills -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-ZPR-PRED-SPILLS-WITH-PADDING
 
 ; Don't emit remarks for non-streaming functions.
 define float @csr_x20_stackargs_notsc(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) {
@@ -66,13 +68,18 @@ entry:
 }
 
 ; SVE calling conventions
-; Predicate register spills end up in FP region, currently.
+; Predicate register spills end up in FP region, currently. This can be
+; mitigated with the -aarch64-enable-zpr-predicate-spills option.
 
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 {
 ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale]
 ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
 ; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale]
 ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_call':
+; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at {{.*}} is too close to GPR stack object
 entry:
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
   %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
@@ -84,6 +91,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
 ; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale]
 ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call':
+; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at {{.*}} is too close to GPR stack object
 entry:
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
   %0 = alloca [37 x i8], align 16
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index 3db3ae65cc555..49362ff5ef655 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -2082,6 +2082,7 @@ void SubtargetEmitter::run(raw_ostream &OS) {
   OS << "\n#ifdef GET_SUBTARGETINFO_TARGET_DESC\n";
   OS << "#undef GET_SUBTARGETINFO_TARGET_DESC\n\n";
 
+  OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n";
   OS << "#include \"llvm/Support/Debug.h\"\n";
   OS << "#include \"llvm/Support/raw_ostream.h\"\n\n";
   if (Target == "AArch64")
@@ -2113,7 +2114,26 @@ void SubtargetEmitter::run(raw_ostream &OS) {
      << " unsigned CPUID) const override;\n"
      << "  DFAPacketizer *createDFAPacketizer(const InstrItineraryData *IID)"
      << " const;\n";
-  if (TGT.getHwModes().getNumModeIds() > 1) {
+
+  const CodeGenHwModes &CGH = TGT.getHwModes();
+  if (CGH.getNumModeIds() > 1) {
+    OS << "  enum class " << Target << "HwModeBits : unsigned {\n";
+    for (unsigned M = 0, NumModes = CGH.getNumModeIds(); M != NumModes; ++M) {
+      StringRef ModeName = CGH.getModeName(M, /*IncludeDefault=*/true);
+      OS << "    " << ModeName << " = ";
+      if (M == 0)
+        OS << "0";
+      else
+        OS << "(1 << " << (M - 1) << ")";
+      OS << ",\n";
+      if (M == NumModes - 1) {
+        OS << "\n";
+        OS << "    LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/" << ModeName
+           << "),\n";
+      }
+    }
+    OS << "  };\n";
+
     OS << "  unsigned getHwModeSet() const override;\n";
     OS << "  unsigned getHwMode(enum HwModeType type = HwMode_Default) const "
           "override;\n";

From 7b9b6740c1ffb4599549e73566202348daaebb02 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Thu, 6 Feb 2025 14:30:19 +0000
Subject: [PATCH 099/282] [AArch64][SME] Reduce ptrue count when filling p-regs
 from z-regs (#125523)

Currently, each expansion of `FILL_PPR_FROM_ZPR_SLOT_PSEUDO` creates a
new ptrue instruction. This patch adds a simple method to reuse a
previous ptrue instruction when expanding back-to-back fill pseudos.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 47 +++++++++-----
 .../AArch64/spill-fill-zpr-predicates.mir     | 61 +------------------
 2 files changed, 34 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 81523adeefcee..d3abd79b85a75 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4175,7 +4175,10 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
 /// Attempts to scavenge a register from \p ScavengeableRegs given the used
 /// registers in \p UsedRegs.
 static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
-                                    BitVector const &ScavengeableRegs) {
+                                    BitVector const &ScavengeableRegs,
+                                    Register PreferredReg) {
+  if (PreferredReg != AArch64::NoRegister && UsedRegs.available(PreferredReg))
+    return PreferredReg;
   for (auto Reg : ScavengeableRegs.set_bits()) {
     if (UsedRegs.available(Reg))
       return Reg;
@@ -4212,11 +4215,12 @@ struct ScopedScavengeOrSpill {
                         Register SpillCandidate, const TargetRegisterClass &RC,
                         LiveRegUnits const &UsedRegs,
                         BitVector const &AllocatableRegs,
-                        std::optional<int> *MaybeSpillFI)
+                        std::optional<int> *MaybeSpillFI,
+                        Register PreferredReg = AArch64::NoRegister)
       : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
                                           *MF.getSubtarget().getInstrInfo())),
         TRI(*MF.getSubtarget().getRegisterInfo()) {
-    FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs);
+    FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs, PreferredReg);
     if (FreeReg != AArch64::NoRegister)
       return;
     assert(MaybeSpillFI && "Expected emergency spill slot FI information "
@@ -4331,12 +4335,10 @@ static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
 /// spilling if necessary). If the status flags are in use at the point of
 /// expansion they are preserved (by moving them to/from a GPR). This may cause
 /// an additional spill if no GPR is free at the expansion point.
-static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
-                                           MachineInstr &MI,
-                                           const TargetRegisterInfo &TRI,
-                                           LiveRegUnits const &UsedRegs,
-                                           ScavengeableRegs const &SR,
-                                           EmergencyStackSlots &SpillSlots) {
+static bool expandFillPPRFromZPRSlotPseudo(
+    MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI,
+    LiveRegUnits const &UsedRegs, ScavengeableRegs const &SR,
+    MachineInstr *&LastPTrue, EmergencyStackSlots &SpillSlots) {
   MachineFunction &MF = *MBB.getParent();
   auto *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
@@ -4347,7 +4349,9 @@ static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
 
   ScopedScavengeOrSpill PredReg(
       MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
-      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI);
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI,
+      /*PreferredReg=*/
+      LastPTrue ? LastPTrue->getOperand(0).getReg() : AArch64::NoRegister);
 
   // Elide NZCV spills if we know it is not used.
   bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
@@ -4371,9 +4375,17 @@ static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
             .addImm(AArch64SysReg::NZCV)
             .addReg(AArch64::NZCV, RegState::Implicit)
             .getInstr());
-  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
-                              .addReg(*PredReg, RegState::Define)
-                              .addImm(31));
+
+  // Reuse previous ptrue if we know it has not been clobbered.
+  if (LastPTrue) {
+    assert(*PredReg == LastPTrue->getOperand(0).getReg());
+    LastPTrue->moveBefore(&MI);
+  } else {
+    LastPTrue = BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
+                    .addReg(*PredReg, RegState::Define)
+                    .addImm(31);
+  }
+  MachineInstrs.push_back(LastPTrue);
   MachineInstrs.push_back(
       BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
           .addReg(MI.getOperand(0).getReg(), RegState::Define)
@@ -4402,19 +4414,24 @@ static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
   LiveRegUnits UsedRegs(TRI);
   UsedRegs.addLiveOuts(MBB);
   bool HasPPRSpills = false;
+  MachineInstr *LastPTrue = nullptr;
   for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
     UsedRegs.stepBackward(MI);
     switch (MI.getOpcode()) {
     case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
+      if (LastPTrue &&
+          MI.definesRegister(LastPTrue->getOperand(0).getReg(), &TRI))
+        LastPTrue = nullptr;
       HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR,
-                                                     SpillSlots);
+                                                     LastPTrue, SpillSlots);
       MI.eraseFromParent();
       break;
     case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
       expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
       MI.eraseFromParent();
-      break;
+      [[fallthrough]];
     default:
+      LastPTrue = nullptr;
       break;
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
index b58f91ac68a93..bff0cacfd5190 100644
--- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -116,46 +116,34 @@ body:             |
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.0)
-    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ;
     ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
@@ -299,37 +287,26 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
@@ -509,37 +486,26 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
@@ -754,37 +720,26 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.20)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.17)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.16)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.15)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.14)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.13)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.12)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.11)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.10)
-    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 20, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.22)
@@ -953,37 +908,26 @@ body:             |
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
@@ -1055,7 +999,6 @@ body:             |
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3)

From 1f3835104e6472f1ad17b5f6dd45c067c7aa5ce8 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Sun, 9 Feb 2025 16:45:04 -0500
Subject: [PATCH 100/282] [libc++] Make benchmarks dry-run by default on the
 release branch

As reported in #125510, doing a full run of the benchmarks during
release testing breaks for some of the testers, and it also takes a
long time. The proper fix would be for the release testing process to
call `check-cxx` instead of running lit directly inside libc++'s test
directory: that will also have the benefit of actually running all of
our tests, not only the Lit ones.

However, since that fix may take longer to happen, this patch tries
to reduce the pain of release testers by dry-running benchmarks by
default instead.
---
 libcxx/utils/libcxx/test/params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 8fd3872cd8cbb..6e13ad75ea949 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -371,7 +371,7 @@ def getSuitableClangTidy(cfg):
         name="enable_benchmarks",
         choices=["no", "run", "dry-run"],
         type=str,
-        default="run",
+        default="dry-run",
         help="Whether to run the benchmarks in the test suite, to only dry-run them or to disable them entirely.",
         actions=lambda mode: [AddFeature(f"enable-benchmarks={mode}")],
     ),

From e15cd84440efbaab4d07a9090c3c5cac116da540 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 Jan 2025 22:17:53 +0000
Subject: [PATCH 101/282] Revert "[SLP] getSpillCost - fully populate
 IntrinsicCostAttributes to improve cost analysis." (#124962)

Reverts llvm/llvm-project#124129 as its currently causing a regression at #124499 - avoids the regression until a proper fix can be added to getSpillCost

(cherry picked from commit 5921295dcaa1ad514d79e0ee824b9df1c077a2d0)
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 14 +++--
 .../SLPVectorizer/AArch64/loadorder.ll        | 33 +++++-----
 .../SLPVectorizer/AArch64/reduce-fadd.ll      | 28 ++++-----
 .../SLPVectorizer/AMDGPU/min_max.ll           |  8 +--
 .../SLPVectorizer/RISCV/complex-loads.ll      | 62 ++++++++-----------
 5 files changed, 73 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2532edc5d8699..19963e780ebd3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12258,12 +12258,18 @@ InstructionCost BoUpSLP::getSpillCost() const {
         if (auto *II = dyn_cast<IntrinsicInst>(I)) {
           if (II->isAssumeLikeIntrinsic())
             return true;
-          IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
+          FastMathFlags FMF;
+          SmallVector<Type *, 4> Tys;
+          for (auto &ArgOp : II->args())
+            Tys.push_back(ArgOp->getType());
+          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+            FMF = FPMO->getFastMathFlags();
+          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
+                                      FMF);
           InstructionCost IntrCost =
               TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
-          InstructionCost CallCost =
-              TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
-                                    TTI::TCK_RecipThroughput);
+          InstructionCost CallCost = TTI->getCallInstrCost(
+              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
           if (IntrCost < CallCost)
             return true;
         }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index 5ad676537f9c4..9ce79e5ea356b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -684,27 +684,27 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]]
 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
 ; CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
-; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[MUL]], 2
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
 ; CHECK-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
 ; CHECK-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
 ; CHECK-NEXT:    [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4
 ; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
 ; CHECK-NEXT:    [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
-; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
 ; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
-; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
+; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
 ; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4
 ; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24
@@ -715,22 +715,25 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28
 ; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX82:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44
+; CHECK-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 36
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX65]], align 4
 ; CHECK-NEXT:    store i32 [[MUL73]], ptr [[Z]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
-; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4
+; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
+; CHECK-NEXT:    store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
 ; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 6576cbe075b74..00a4417ba7aff 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -716,29 +716,29 @@ define float @reduce_float_case3(ptr %a) {
 ; CHECK-NEXT:    [[GEP5:%.*]] = getelementptr inbounds float, ptr [[A]], i32 5
 ; CHECK-NEXT:    [[GEP6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 6
 ; CHECK-NEXT:    [[GEP7:%.*]] = getelementptr inbounds float, ptr [[A]], i32 7
-; CHECK-NEXT:    [[LOAD2:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[LOAD3:%.*]] = load float, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[LOAD4:%.*]] = load float, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[LOAD5:%.*]] = load float, ptr [[GEP3]], align 4
-; CHECK-NEXT:    [[LOAD6:%.*]] = load float, ptr [[GEP4]], align 4
-; CHECK-NEXT:    [[LOAD7:%.*]] = load float, ptr [[GEP5]], align 4
-; CHECK-NEXT:    [[LOAD8:%.*]] = load float, ptr [[GEP6]], align 4
-; CHECK-NEXT:    [[LOAD9:%.*]] = load float, ptr [[GEP7]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[LOAD1:%.*]] = load float, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4
+; CHECK-NEXT:    [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT:    [[LOAD5:%.*]] = load float, ptr [[GEP5]], align 4
+; CHECK-NEXT:    [[LOAD6:%.*]] = load float, ptr [[GEP6]], align 4
+; CHECK-NEXT:    [[LOAD7:%.*]] = load float, ptr [[GEP7]], align 4
+; CHECK-NEXT:    [[LOG:%.*]] = call float @llvm.log.f32(float [[LOAD]])
+; CHECK-NEXT:    [[LOG1:%.*]] = call float @llvm.log.f32(float [[LOAD1]])
 ; CHECK-NEXT:    [[LOG2:%.*]] = call float @llvm.log.f32(float [[LOAD2]])
 ; CHECK-NEXT:    [[LOG3:%.*]] = call float @llvm.log.f32(float [[LOAD3]])
 ; CHECK-NEXT:    [[LOG4:%.*]] = call float @llvm.log.f32(float [[LOAD4]])
 ; CHECK-NEXT:    [[LOG5:%.*]] = call float @llvm.log.f32(float [[LOAD5]])
 ; CHECK-NEXT:    [[LOG6:%.*]] = call float @llvm.log.f32(float [[LOAD6]])
 ; CHECK-NEXT:    [[LOG7:%.*]] = call float @llvm.log.f32(float [[LOAD7]])
-; CHECK-NEXT:    [[LOG8:%.*]] = call float @llvm.log.f32(float [[LOAD8]])
-; CHECK-NEXT:    [[LOG9:%.*]] = call float @llvm.log.f32(float [[LOAD9]])
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[LOG2]], [[LOG3]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[LOG]], [[LOG1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[ADD1]], [[LOG2]]
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[ADD2]], [[LOG3]]
 ; CHECK-NEXT:    [[ADD4:%.*]] = fadd float [[ADD3]], [[LOG4]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = fadd float [[ADD4]], [[LOG5]]
 ; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[ADD5]], [[LOG6]]
-; CHECK-NEXT:    [[ADD8:%.*]] = fadd float [[ADD6]], [[LOG7]]
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[ADD8]], [[LOG8]]
-; CHECK-NEXT:    [[ADD7:%.*]] = fadd float [[ADD9]], [[LOG9]]
+; CHECK-NEXT:    [[ADD7:%.*]] = fadd float [[ADD6]], [[LOG7]]
 ; CHECK-NEXT:    ret float [[ADD7]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
index a3be8f5e935c9..46c6c10125b95 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
@@ -358,12 +358,12 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
 ; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
-; GFX8-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 11fa3337544a1..257e4660c80aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -28,9 +28,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
@@ -46,6 +50,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32
 ; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
@@ -59,8 +64,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0
-; CHECK-NEXT:    [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
 ; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]]
@@ -115,7 +120,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
 ; CHECK-NEXT:    [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]]
 ; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]]
-; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
+; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15
+; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
+; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
+; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15
 ; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
 ; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15
@@ -231,10 +244,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]]
 ; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
 ; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
-; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
 ; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
 ; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
@@ -242,42 +255,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT:    [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32>
-; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0
-; CHECK-NEXT:    [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0
-; CHECK-NEXT:    [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]]
-; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]]
-; CHECK-NEXT:    [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15)
-; CHECK-NEXT:    [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0
-; CHECK-NEXT:    [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0
-; CHECK-NEXT:    [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]]
-; CHECK-NEXT:    [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]]
-; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
-; CHECK-NEXT:    [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
+; CHECK-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
 ; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT:    [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]]
-; CHECK-NEXT:    [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
-; CHECK-NEXT:    [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
-; CHECK-NEXT:    [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
 ; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]]
-; CHECK-NEXT:    [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]]
+; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
+; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]]
+; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]]
 ; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
 ; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
 ; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]]
-; CHECK-NEXT:    [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0
 ; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1
 ; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; CHECK-NEXT:    [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]]

From b4421735de4d487592708e96431b88fa53e82d2a Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Mon, 10 Feb 2025 10:57:22 -0500
Subject: [PATCH 102/282] [clang] Handle f(no-)strict-overflow, f(no-)wrapv,
 f(no-)wrapv-pointer like gcc (#126524)

We now process all 6 options left-to-right and pick whatever is active
at the end.

Fixes #124868.

(cherry picked from commit 783275eb7b3ecde63bdb6ac1316c090bfc568bdd)
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 48 +++++++++++++++-------
 clang/test/Driver/clang_wrapv_opts.c       | 24 +++++------
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 4ed4dbc1a8d1b..ae635fb6a1807 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -3098,21 +3098,39 @@ bool tools::shouldRecordCommandLine(const ToolChain &TC,
 
 void tools::renderCommonIntegerOverflowOptions(const ArgList &Args,
                                                ArgStringList &CmdArgs) {
-  // -fno-strict-overflow implies -fwrapv if it isn't disabled, but
-  // -fstrict-overflow won't turn off an explicitly enabled -fwrapv.
-  bool StrictOverflow = Args.hasFlag(options::OPT_fstrict_overflow,
-                                     options::OPT_fno_strict_overflow, true);
-  if (Arg *A = Args.getLastArg(options::OPT_fwrapv, options::OPT_fno_wrapv)) {
-    if (A->getOption().matches(options::OPT_fwrapv))
-      CmdArgs.push_back("-fwrapv");
-  } else if (!StrictOverflow) {
-    CmdArgs.push_back("-fwrapv");
+  bool use_fwrapv = false;
+  bool use_fwrapv_pointer = false;
+  for (const Arg *A : Args.filtered(
+           options::OPT_fstrict_overflow, options::OPT_fno_strict_overflow,
+           options::OPT_fwrapv, options::OPT_fno_wrapv,
+           options::OPT_fwrapv_pointer, options::OPT_fno_wrapv_pointer)) {
+    A->claim();
+    switch (A->getOption().getID()) {
+    case options::OPT_fstrict_overflow:
+      use_fwrapv = false;
+      use_fwrapv_pointer = false;
+      break;
+    case options::OPT_fno_strict_overflow:
+      use_fwrapv = true;
+      use_fwrapv_pointer = true;
+      break;
+    case options::OPT_fwrapv:
+      use_fwrapv = true;
+      break;
+    case options::OPT_fno_wrapv:
+      use_fwrapv = false;
+      break;
+    case options::OPT_fwrapv_pointer:
+      use_fwrapv_pointer = true;
+      break;
+    case options::OPT_fno_wrapv_pointer:
+      use_fwrapv_pointer = false;
+      break;
+    }
   }
-  if (Arg *A = Args.getLastArg(options::OPT_fwrapv_pointer,
-                               options::OPT_fno_wrapv_pointer)) {
-    if (A->getOption().matches(options::OPT_fwrapv_pointer))
-      CmdArgs.push_back("-fwrapv-pointer");
-  } else if (!StrictOverflow) {
+
+  if (use_fwrapv)
+    CmdArgs.push_back("-fwrapv");
+  if (use_fwrapv_pointer)
     CmdArgs.push_back("-fwrapv-pointer");
-  }
 }
diff --git a/clang/test/Driver/clang_wrapv_opts.c b/clang/test/Driver/clang_wrapv_opts.c
index 9f3a884324dcd..295d8deb0d99d 100644
--- a/clang/test/Driver/clang_wrapv_opts.c
+++ b/clang/test/Driver/clang_wrapv_opts.c
@@ -1,20 +1,20 @@
 // RUN: %clang -### -S -fwrapv -fno-wrapv -fwrapv -Werror %s 2>&1 | FileCheck -check-prefix=CHECK1 %s
 // CHECK1: "-fwrapv"
-//
+
 // RUN: %clang -### -S -fwrapv-pointer -fno-wrapv-pointer -fwrapv-pointer -Werror %s 2>&1 | FileCheck -check-prefix=CHECK1-POINTER %s
 // CHECK1-POINTER: "-fwrapv-pointer"
-//
+
 // RUN: %clang -### -S -fstrict-overflow -fno-strict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK2 %s
 // CHECK2: "-fwrapv"{{.*}}"-fwrapv-pointer"
-//
+
 // RUN: %clang -### -S -fwrapv -fstrict-overflow -Werror -Werror %s 2>&1 | FileCheck -check-prefix=CHECK3 %s --implicit-check-not="-fwrapv-pointer"
-// CHECK3: "-fwrapv"
-//
+// CHECK3-NOT: "-fwrapv"
+
 // RUN: %clang -### -S -fwrapv-pointer -fstrict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK3-POINTER %s --implicit-check-not="-fwrapv"
-// CHECK3-POINTER: "-fwrapv-pointer"
-//
-// RUN: %clang -### -S -fno-wrapv -fno-strict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4 %s --implicit-check-not="-fwrapv"
-// CHECK4: "-fwrapv-pointer"
-//
-// RUN: %clang -### -S -fno-wrapv-pointer -fno-strict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4-POINTER %s --implicit-check-not="-fwrapv-pointer"
-// CHECK4-POINTER: "-fwrapv"
+// CHECK3-POINTER-NOT: "-fwrapv-pointer"
+
+// RUN: %clang -### -S -fno-wrapv -fno-strict-overflow -fno-wrapv-pointer -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4 %s --implicit-check-not="-fwrapv-pointer"
+// CHECK4: "-fwrapv"
+
+// RUN: %clang -### -S -fno-wrapv-pointer -fno-strict-overflow -fno-wrapv -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4-POINTER %s --implicit-check-not="-fwrapv"
+// CHECK4-POINTER: "-fwrapv-pointer"

From 984a779b81552a97cbfcb8139ad646a958cf47ce Mon Sep 17 00:00:00 2001
From: Daniil Kovalev <dkovalev@accesssoftek.com>
Date: Wed, 5 Feb 2025 11:39:27 +0300
Subject: [PATCH 103/282] [PAC] Do not support some values of branch-protection
 with ptrauth-returns (#125280)

This patch does two things.

1. Previously, when checking driver arguments, we emitted an error for
unsupported values of `-mbranch-protection` when using pauthtest ABI.
The reason for that was ptrauth-returns being enabled as part of
pauthtest. This patch changes the check against pauthtest to a check
against ptrauth-returns.

2. Similarly, check against values of the following function attribute
which are unsupported with ptrauth-returns:
`__attribute__((target("branch-protection=XXX`. Note that existing
`validateBranchProtection` function is used, and current behavior is to
ignore the unsupported attribute value, so no error is emitted.

(cherry picked from commit 84b0c128a751acfbf5b439edc724ba27d1da653e)
---
 clang/include/clang/Basic/TargetInfo.h        |  1 +
 clang/lib/Basic/Targets/AArch64.cpp           |  8 ++++
 clang/lib/Basic/Targets/AArch64.h             |  1 +
 clang/lib/Basic/Targets/ARM.cpp               |  1 +
 clang/lib/Basic/Targets/ARM.h                 |  1 +
 clang/lib/CodeGen/Targets/AArch64.cpp         |  4 +-
 clang/lib/CodeGen/Targets/ARM.cpp             |  4 +-
 clang/lib/Driver/ToolChains/Clang.cpp         | 46 ++++++++++---------
 clang/lib/Sema/SemaDeclAttr.cpp               |  3 +-
 clang/test/Driver/aarch64-ptrauth.c           | 27 +++++++----
 ...rch64-ignore-branch-protection-attribute.c | 31 +++++++++++++
 11 files changed, 91 insertions(+), 36 deletions(-)
 create mode 100644 clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index d762144478b48..1a8398d449cd2 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1469,6 +1469,7 @@ class TargetInfo : public TransferrableTargetInfo,
   /// specification
   virtual bool validateBranchProtection(StringRef Spec, StringRef Arch,
                                         BranchProtectionInfo &BPI,
+                                        const LangOptions &LO,
                                         StringRef &Err) const {
     Err = "";
     return false;
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 57c9849ef2a72..cabda0a1323a3 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -253,11 +253,19 @@ bool AArch64TargetInfo::validateGlobalRegisterVariable(
 
 bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef,
                                                  BranchProtectionInfo &BPI,
+                                                 const LangOptions &LO,
                                                  StringRef &Err) const {
   llvm::ARM::ParsedBranchProtection PBP;
   if (!llvm::ARM::parseBranchProtection(Spec, PBP, Err, HasPAuthLR))
     return false;
 
+  // GCS is currently untested with ptrauth-returns, but enabling this could be
+  // allowed in future after testing with a suitable system.
+  if (LO.PointerAuthReturns &&
+      (PBP.Scope != "none" || PBP.BranchProtectionPAuthLR ||
+       PBP.GuardedControlStack))
+    return false;
+
   BPI.SignReturnAddr =
       llvm::StringSwitch<LangOptions::SignReturnAddressScopeKind>(PBP.Scope)
           .Case("non-leaf", LangOptions::SignReturnAddressScopeKind::NonLeaf)
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index b75d2a9dc8eca..527e49d63512c 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -132,6 +132,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
 
   bool validateBranchProtection(StringRef Spec, StringRef Arch,
                                 BranchProtectionInfo &BPI,
+                                const LangOptions &LO,
                                 StringRef &Err) const override;
 
   bool isValidCPUName(StringRef Name) const override;
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index 0fd5433a76402..aad05052bcf06 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -405,6 +405,7 @@ bool ARMTargetInfo::isBranchProtectionSupportedArch(StringRef Arch) const {
 
 bool ARMTargetInfo::validateBranchProtection(StringRef Spec, StringRef Arch,
                                              BranchProtectionInfo &BPI,
+                                             const LangOptions &LO,
                                              StringRef &Err) const {
   llvm::ARM::ParsedBranchProtection PBP;
   if (!llvm::ARM::parseBranchProtection(Spec, PBP, Err))
diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h
index fdb40c3d41918..a42362724b654 100644
--- a/clang/lib/Basic/Targets/ARM.h
+++ b/clang/lib/Basic/Targets/ARM.h
@@ -155,6 +155,7 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo {
   bool isBranchProtectionSupportedArch(StringRef Arch) const override;
   bool validateBranchProtection(StringRef Spec, StringRef Arch,
                                 BranchProtectionInfo &BPI,
+                                const LangOptions &LO,
                                 StringRef &Err) const override;
 
   // FIXME: This should be based on Arch attributes, not CPU names.
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 057199c66f5a1..170ce1640367a 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -147,8 +147,8 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
           CGM.getTarget().parseTargetAttr(TA->getFeaturesStr());
       if (!Attr.BranchProtection.empty()) {
         StringRef Error;
-        (void)CGM.getTarget().validateBranchProtection(Attr.BranchProtection,
-                                                       Attr.CPU, BPI, Error);
+        (void)CGM.getTarget().validateBranchProtection(
+            Attr.BranchProtection, Attr.CPU, BPI, CGM.getLangOpts(), Error);
         assert(Error.empty());
       }
     }
diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp
index 47e31ceeaf294..77641ce10e3d3 100644
--- a/clang/lib/CodeGen/Targets/ARM.cpp
+++ b/clang/lib/CodeGen/Targets/ARM.cpp
@@ -149,8 +149,8 @@ class ARMTargetCodeGenInfo : public TargetCodeGenInfo {
         StringRef DiagMsg;
         StringRef Arch =
             Attr.CPU.empty() ? CGM.getTarget().getTargetOpts().CPU : Attr.CPU;
-        if (!CGM.getTarget().validateBranchProtection(Attr.BranchProtection,
-                                                      Arch, BPI, DiagMsg)) {
+        if (!CGM.getTarget().validateBranchProtection(
+                Attr.BranchProtection, Arch, BPI, CGM.getLangOpts(), DiagMsg)) {
           CGM.getDiags().Report(
               D->getLocation(),
               diag::warn_target_unsupported_branch_protection_attribute)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 589de953be5be..ec5ee29ece434 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1618,32 +1618,34 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
     GuardedControlStack = PBP.GuardedControlStack;
   }
 
-  CmdArgs.push_back(
-      Args.MakeArgString(Twine("-msign-return-address=") + Scope));
-  if (Scope != "none") {
+  bool HasPtrauthReturns = llvm::any_of(CmdArgs, [](const char *Arg) {
+    return StringRef(Arg) == "-fptrauth-returns";
+  });
+  // GCS is currently untested with ptrauth-returns, but enabling this could be
+  // allowed in future after testing with a suitable system.
+  if (HasPtrauthReturns &&
+      (Scope != "none" || BranchProtectionPAuthLR || GuardedControlStack)) {
     if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
       D.Diag(diag::err_drv_unsupported_opt_for_target)
           << A->getAsString(Args) << Triple.getTriple();
+    else
+      D.Diag(diag::err_drv_incompatible_options)
+          << A->getAsString(Args) << "-fptrauth-returns";
+  }
+
+  CmdArgs.push_back(
+      Args.MakeArgString(Twine("-msign-return-address=") + Scope));
+  if (Scope != "none")
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-msign-return-address-key=") + Key));
-  }
-  if (BranchProtectionPAuthLR) {
-    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-      D.Diag(diag::err_drv_unsupported_opt_for_target)
-          << A->getAsString(Args) << Triple.getTriple();
+  if (BranchProtectionPAuthLR)
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-mbranch-protection-pauth-lr")));
-  }
   if (IndirectBranches)
     CmdArgs.push_back("-mbranch-target-enforce");
-  // GCS is currently untested with PAuthABI, but enabling this could be allowed
-  // in future after testing with a suitable system.
-  if (GuardedControlStack) {
-    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-      D.Diag(diag::err_drv_unsupported_opt_for_target)
-          << A->getAsString(Args) << Triple.getTriple();
+
+  if (GuardedControlStack)
     CmdArgs.push_back("-mguarded-control-stack");
-  }
 }
 
 void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
@@ -1822,12 +1824,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
       CmdArgs.push_back("-aarch64-enable-global-merge=true");
   }
 
-  // Enable/disable return address signing and indirect branch targets.
-  CollectARMPACBTIOptions(getToolChain(), Args, CmdArgs, true /*isAArch64*/);
-
-  if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-    handlePAuthABI(Args, CmdArgs);
-
   // Handle -msve_vector_bits=<bits>
   if (Arg *A = Args.getLastArg(options::OPT_msve_vector_bits_EQ)) {
     StringRef Val = A->getValue();
@@ -1896,6 +1892,12 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
                     options::OPT_fno_ptrauth_init_fini_address_discrimination);
   Args.addOptInFlag(CmdArgs, options::OPT_faarch64_jump_table_hardening,
                     options::OPT_fno_aarch64_jump_table_hardening);
+
+  if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
+    handlePAuthABI(Args, CmdArgs);
+
+  // Enable/disable return address signing and indirect branch targets.
+  CollectARMPACBTIOptions(getToolChain(), Args, CmdArgs, true /*isAArch64*/);
 }
 
 void Clang::AddLoongArchTargetArgs(const ArgList &Args,
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 9d7d22590bce4..f351663c6824e 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3073,7 +3073,8 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
   if (ParsedAttrs.BranchProtection.empty())
     return false;
   if (!Context.getTargetInfo().validateBranchProtection(
-          ParsedAttrs.BranchProtection, ParsedAttrs.CPU, BPI, DiagMsg)) {
+          ParsedAttrs.BranchProtection, ParsedAttrs.CPU, BPI,
+          Context.getLangOpts(), DiagMsg)) {
     if (DiagMsg.empty())
       return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
              << Unsupported << None << "branch-protection" << Target;
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index d036189e61498..1d2993f4c60c4 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -64,22 +64,31 @@
 
 //// The only branch protection option compatible with PAuthABI is BTI.
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4
+// RUN:   FileCheck %s --check-prefix=ERR4_1
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4
-// ERR4: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest'
+// RUN:   FileCheck %s --check-prefix=ERR4_1
+// RUN: not %clang -### -c --target=aarch64 -fptrauth-returns     -mbranch-protection=pac-ret %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR4_2
+// ERR4_1: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest'
+// ERR4_2: error: the combination of '-mbranch-protection=pac-ret' and '-fptrauth-returns' is incompatible
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5
+// RUN:   FileCheck %s --check-prefix=ERR5_1
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5
-// ERR5: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest'
+// RUN:   FileCheck %s --check-prefix=ERR5_1
+// RUN: not %clang -### -c --target=aarch64 -fptrauth-returns     -mbranch-protection=gcs %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR5_2
+// ERR5_1: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest'
+// ERR5_2: error: the combination of '-mbranch-protection=gcs' and '-fptrauth-returns' is incompatible
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6
+// RUN:   FileCheck %s --check-prefix=ERR6_1
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6
-// ERR6: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest'
+// RUN:   FileCheck %s --check-prefix=ERR6_1
+// RUN: not %clang -### -c --target=aarch64 -fptrauth-returns     -mbranch-protection=standard %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR6_2
+// ERR6_1: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest'
+// ERR6_2: error: the combination of '-mbranch-protection=standard' and '-fptrauth-returns' is incompatible
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=all %s 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=ERR7
diff --git a/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c b/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c
new file mode 100644
index 0000000000000..32cc98dd4e037
--- /dev/null
+++ b/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c
@@ -0,0 +1,31 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang -target aarch64-linux-pauthtest   %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s
+// RUN: %clang -target aarch64 -fptrauth-returns %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s
+
+/// Unsupported with pauthtest, warning emitted
+__attribute__((target("branch-protection=pac-ret"))) void f1() {}
+// CHECK:      warning: unsupported 'branch-protection' in the 'target' attribute string; 'target' attribute ignored [-Wignored-attributes]
+// CHECK-NEXT: __attribute__((target("branch-protection=pac-ret"))) void f1() {}
+__attribute__((target("branch-protection=gcs"))) void f2() {}
+// CHECK:      warning: unsupported 'branch-protection' in the 'target' attribute string; 'target' attribute ignored [-Wignored-attributes]
+// CHECK-NEXT: __attribute__((target("branch-protection=gcs"))) void f2() {}
+__attribute__((target("branch-protection=standard"))) void f3() {}
+// CHECK:      warning: unsupported 'branch-protection' in the 'target' attribute string; 'target' attribute ignored [-Wignored-attributes]
+// CHECK-NEXT: __attribute__((target("branch-protection=standard"))) void f3() {}
+
+/// Supported with pauthtest, no warning emitted
+__attribute__((target("branch-protection=bti"))) void f4() {}
+
+/// Supported with pauthtest, no warning emitted
+__attribute__((target("branch-protection=none"))) void f5() {}
+
+/// Check there are no branch protection function attributes which are unsupported with pauthtest
+
+// CHECK-NOT:  attributes {{.*}} "sign-return-address"
+// CHECK-NOT:  attributes {{.*}} "sign-return-address-key"
+// CHECK-NOT:  attributes {{.*}} "branch-protection-pauth-lr"
+// CHECK-NOT:  attributes {{.*}} "guarded-control-stack"
+
+/// Check function attributes which are supported with pauthtest
+// CHECK:      attributes {{.*}} "branch-target-enforcement"

From 88f8956711f7c8d306d08fff8603d6b99e8302c1 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 4 Feb 2025 16:37:21 +0100
Subject: [PATCH 104/282] [mlir] Fix MLIRTestDialect dependency in MLIRTestIR

This is a test library which is not part of libMLIR, so it should
use normal LINK_LIBS instead of mlir_target_link_libraries.

This fixes an issue introduced in #123910 and follows up on the
fix in #125004, which added the library to DEPENDS, which is not
sufficient.
---
 mlir/test/lib/IR/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt
index e5416da70d500..71a96c7f92c0c 100644
--- a/mlir/test/lib/IR/CMakeLists.txt
+++ b/mlir/test/lib/IR/CMakeLists.txt
@@ -27,13 +27,15 @@ add_mlir_library(MLIRTestIR
   TestVisitorsGeneric.cpp
 
   EXCLUDE_FROM_LIBMLIR
+
+  LINK_LIBS PUBLIC
+  MLIRTestDialect
   )
 mlir_target_link_libraries(MLIRTestIR PUBLIC
   MLIRPass
   MLIRBytecodeReader
   MLIRBytecodeWriter
   MLIRFunctionInterfaces
-  MLIRTestDialect
   )
 
 target_include_directories(MLIRTestIR

From dfa60a77e0bae875ea30340067bebea1c70b9d3d Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 5 Feb 2025 09:48:23 +0100
Subject: [PATCH 105/282] [flang] Move FIRSupport dependency to correct place
 (#125697)

This library is provided by flang, not MLIR, so it should not be part of
MLIR_LIBS.

Fixes an issue introduced in https://github.com/llvm/llvm-project/pull/120966.

(cherry picked from commit ee76bdac192ce86c5d13e4c712e0327aaefda45f)
---
 flang/lib/Optimizer/Analysis/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt
index 6fe9c70f83765..c4dae898f8e57 100644
--- a/flang/lib/Optimizer/Analysis/CMakeLists.txt
+++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt
@@ -12,6 +12,7 @@ add_flang_library(FIRAnalysis
   LINK_LIBS
   FIRBuilder
   FIRDialect
+  FIRSupport
   HLFIRDialect
 
   MLIR_LIBS
@@ -19,5 +20,4 @@ add_flang_library(FIRAnalysis
   MLIRLLVMDialect
   MLIRMathTransforms
   MLIROpenMPDialect
-  FIRSupport
 )

From 4c4ed5e2f5357d724e4c26d21ee3e840210b917f Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 5 Feb 2025 11:58:44 +0100
Subject: [PATCH 106/282] [flang][cmake] Fix bcc dependencies (#125822)

The Fortran libraries are not part of MLIR, so they should use
target_link_libraries() rather than mlir_target_link_libraries().

This fixes an issue introduced in
https://github.com/llvm/llvm-project/pull/120966.

(cherry picked from commit f9af5c145f40480d46874b643ca2b1237e9fbb2a)
---
 flang/tools/bbc/CMakeLists.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt
index 85aeb85e0c530..97462be83ea43 100644
--- a/flang/tools/bbc/CMakeLists.txt
+++ b/flang/tools/bbc/CMakeLists.txt
@@ -29,6 +29,11 @@ target_link_libraries(bbc PRIVATE
   flangFrontend
   flangPasses
   FlangOpenMPTransforms
+  FortranCommon
+  FortranParser
+  FortranEvaluate
+  FortranSemantics
+  FortranLower
 )
 
 mlir_target_link_libraries(bbc PRIVATE
@@ -36,9 +41,4 @@ mlir_target_link_libraries(bbc PRIVATE
   ${extension_libs}
   MLIRAffineToStandard
   MLIRSCFToControlFlow
-  FortranCommon
-  FortranParser
-  FortranEvaluate
-  FortranSemantics
-  FortranLower
 )

From 6195c3a981c8ea8729b940388d91f9238ad7c57e Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 30 Jan 2025 20:34:29 +0100
Subject: [PATCH 107/282] [Clang] Fix __{add,remove}_pointer in Objective-C++
 (#123678)

This aligns the builtins with how implementations work which don't use
the buitins.
---
 clang/docs/ReleaseNotes.rst          |  2 ++
 clang/lib/Sema/SemaType.cpp          |  6 +++---
 clang/test/SemaCXX/remove_pointer.mm |  8 --------
 clang/test/SemaObjCXX/type-traits.mm | 17 +++++++++++++++++
 4 files changed, 22 insertions(+), 11 deletions(-)
 delete mode 100644 clang/test/SemaCXX/remove_pointer.mm
 create mode 100644 clang/test/SemaObjCXX/type-traits.mm

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b61563ade0a1e..ad1a5e7ae282e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -912,6 +912,8 @@ Bug Fixes to Compiler Builtins
 
 - Fix ``__builtin_source_location`` incorrectly returning wrong column for method chains. (#GH119129)
 
+- The behvaiour of ``__add_pointer`` and ``__remove_pointer`` for Objective-C++'s ``id`` and interfaces has been fixed.
+
 Bug Fixes to Attribute Support
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 1a591a5376f5e..1fa5239a597c8 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -1826,7 +1826,8 @@ QualType Sema::BuildPointerType(QualType T,
   if (checkQualifiedFunction(*this, T, Loc, QFK_Pointer))
     return QualType();
 
-  assert(!T->isObjCObjectType() && "Should build ObjCObjectPointerType");
+  if (T->isObjCObjectType())
+    return Context.getObjCObjectPointerType(T);
 
   // In ARC, it is forbidden to build pointers to unqualified pointers.
   if (getLangOpts().ObjCAutoRefCount)
@@ -9808,8 +9809,7 @@ QualType Sema::BuiltinAddPointer(QualType BaseType, SourceLocation Loc) {
 }
 
 QualType Sema::BuiltinRemovePointer(QualType BaseType, SourceLocation Loc) {
-  // We don't want block pointers or ObjectiveC's id type.
-  if (!BaseType->isAnyPointerType() || BaseType->isObjCIdType())
+  if (!BaseType->isAnyPointerType())
     return BaseType;
 
   return BaseType->getPointeeType();
diff --git a/clang/test/SemaCXX/remove_pointer.mm b/clang/test/SemaCXX/remove_pointer.mm
deleted file mode 100644
index d1cf1fa9f4efc..0000000000000
--- a/clang/test/SemaCXX/remove_pointer.mm
+++ /dev/null
@@ -1,8 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-// expected-no-diagnostics
-
-@class X;
-
-static_assert(__is_same(__remove_pointer(X *), X), "");
-static_assert(__is_same(__remove_pointer(id), id), "");
diff --git a/clang/test/SemaObjCXX/type-traits.mm b/clang/test/SemaObjCXX/type-traits.mm
new file mode 100644
index 0000000000000..81b9573b52192
--- /dev/null
+++ b/clang/test/SemaObjCXX/type-traits.mm
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=c++17  %s
+
+// expected-no-diagnostics
+
+@interface I;
+@end
+
+@class C;
+
+static_assert(__is_same(__add_pointer(id), id*));
+static_assert(__is_same(__add_pointer(I), I*));
+
+static_assert(__is_same(__remove_pointer(C*), C));
+static_assert(!__is_same(__remove_pointer(id), id));
+static_assert(__is_same(__remove_pointer(id*), id));
+static_assert(__is_same(__remove_pointer(__add_pointer(id)), id));
+static_assert(__is_same(__add_pointer(__remove_pointer(id)), id));

From 1ee32d22344f94f179063e3ddf8ae880af2cb240 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 5 Feb 2025 12:38:48 -0600
Subject: [PATCH 108/282] [Clang] Add width handling for <gpuintrin.h> shuffle
 helper (#125896)

Summary:
The CUDA impelementation has long supported the `width` argument on its
shuffle instrucitons, which makes it more difficult to replace those
uses with this helper. This patch just correctly implements that for
AMDGPU and NVPTX so it's equivalent to `__shfl_sync` in CUDA. This will
ease porting.

Fortunately these get optimized out correctly when passing in known
widths.

(cherry picked from commit 2d8106cb5a505326d1da0f4461708ed44a0ac761)
---
 clang/lib/Headers/amdgpuintrin.h              | 14 +++++---
 clang/lib/Headers/gpuintrin.h                 | 24 ++++++++------
 clang/lib/Headers/nvptxintrin.h               | 15 +++++----
 libc/src/__support/GPU/utils.h                |  5 +--
 .../src/__support/GPU/CMakeLists.txt          |  9 +++++
 .../integration/src/__support/GPU/shuffle.cpp | 33 +++++++++++++++++++
 6 files changed, 76 insertions(+), 24 deletions(-)
 create mode 100644 libc/test/integration/src/__support/GPU/shuffle.cpp

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f..9dad99ffe9439 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
-  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
 }
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
-         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+  return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+          << 32ull) |
+         ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308f..11c87e85cd497 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
 
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
       float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
-                                   __builtin_bit_cast(uint32_t, __x)));
+                                   __builtin_bit_cast(uint32_t, __x), __width));
 }
 
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
-      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
-                                    __builtin_bit_cast(uint64_t, __x)));
+      double,
+      __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                            __builtin_bit_cast(uint64_t, __x), __width));
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
       uint64_t __lane_mask, __type __x) {                                      \
     for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
       uint32_t __index = __step + __gpu_lane_id();                             \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,           \
+                                          __gpu_num_lanes());                  \
     }                                                                          \
     return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
   }
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
       uint32_t __index = __gpu_lane_id() - __step;                             \
       __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
       __x += __builtin_bit_cast(                                               \
-          __type,                                                              \
-          -bitmask & __builtin_bit_cast(__bitmask_type,                        \
-                                        __gpu_shuffle_idx_##__suffix(          \
-                                            __lane_mask, __index, __x)));      \
+          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
+                                                __gpu_shuffle_idx_##__suffix(  \
+                                                    __lane_mask, __index, __x, \
+                                                    __gpu_num_lanes())));      \
     }                                                                          \
     return __x;                                                                \
   }
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09..40fa2edebe975 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+                                  ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   uint32_t __mask = (uint32_t)__lane_mask;
-  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
-                                             __gpu_num_lanes() - 1u)
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
           << 32ull) |
-         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
-                                             __gpu_num_lanes() - 1u));
+         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 }
 
 // Returns true if the flat pointer points to CUDA 'shared' memory.
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22..323c003f1ff07 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
 
 LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
 
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
-  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+                             uint32_t width = __gpu_num_lanes()) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
 }
 
 [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 7811e0da45ddc..68bbc3849bc7e 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -9,3 +9,12 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_integration_test(
+  shuffle_test
+  SUITE libc-support-gpu-tests
+  SRCS
+    shuffle.cpp
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/shuffle.cpp b/libc/test/integration/src/__support/GPU/shuffle.cpp
new file mode 100644
index 0000000000000..c346a2eb3f0c2
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/shuffle.cpp
@@ -0,0 +1,33 @@
+//===-- Test for the shuffle operations on the GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to make sure the shuffle instruction works by doing a simple broadcast.
+// Each iteration reduces the width, so it will broadcast to a subset we check.
+static void test_shuffle() {
+  uint64_t mask = gpu::get_lane_mask();
+  EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());
+
+  uint32_t x = gpu::get_lane_id();
+  for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
+    EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
+  test_shuffle();
+
+  return 0;
+}

From 9e7f8352c7a3dbbade39c11fe6f786d0457956ee Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 5 Feb 2025 12:55:05 -0600
Subject: [PATCH 109/282] [Clang] Fix test after new argument was added

(cherry picked from commit 718cdeb9c701725412a040b2b7148523a286a256)
---
 clang/test/Headers/gpuintrin.c | 68 ++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 281339716c3ed..89efe12ee8def 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -38,7 +38,7 @@
 // AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
@@ -70,7 +70,7 @@
 // NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
@@ -90,6 +90,68 @@ __gpu_kernel void foo() {
   __gpu_num_threads_z();
   __gpu_num_threads(0);
   __gpu_thread_id_x();
+// AMDGPU-LABEL: define internal i32 @__gpu_thread_id(
+// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT:    [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
+// AMDGPU-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// AMDGPU-NEXT:      i32 0, label %[[SW_BB:.*]]
+// AMDGPU-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// AMDGPU-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// AMDGPU-NEXT:    ]
+// AMDGPU:       [[SW_BB]]:
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN:.*]]
+// AMDGPU:       [[SW_BB1]]:
+// AMDGPU-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN]]
+// AMDGPU:       [[SW_BB3]]:
+// AMDGPU-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN]]
+// AMDGPU:       [[SW_DEFAULT]]:
+// AMDGPU-NEXT:    unreachable
+// AMDGPU:       [[RETURN]]:
+// AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    ret i32 [[TMP1]]
+//
+// NVPTX-LABEL: define internal i32 @__gpu_thread_id(
+// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// NVPTX-NEXT:      i32 0, label %[[SW_BB:.*]]
+// NVPTX-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// NVPTX-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// NVPTX-NEXT:    ]
+// NVPTX:       [[SW_BB]]:
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN:.*]]
+// NVPTX:       [[SW_BB1]]:
+// NVPTX-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN]]
+// NVPTX:       [[SW_BB3]]:
+// NVPTX-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN]]
+// NVPTX:       [[SW_DEFAULT]]:
+// NVPTX-NEXT:    unreachable
+// NVPTX:       [[RETURN]]:
+// NVPTX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    ret i32 [[TMP1]]
+//
   __gpu_thread_id_y();
   __gpu_thread_id_z();
   __gpu_thread_id(0);
@@ -100,7 +162,7 @@ __gpu_kernel void foo() {
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);
-  __gpu_shuffle_idx_u32(-1, -1, -1);
+  __gpu_shuffle_idx_u32(-1, -1, -1, 0);
   __gpu_first_lane_id(-1);
   __gpu_is_first_in_lane(-1);
   __gpu_exit();

From 553185bd7188fa1c48cce8a661df812a03afb5b7 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 5 Feb 2025 09:31:58 -0800
Subject: [PATCH 110/282] [BOLT,test] Link against a shared object to test PLT
 (#125625)

A few tests generate a statically-linked position-independent executable
with `-nostdlib -Wl,--unresolved-symbols=ignore-all -pie` (`%clang`) and
test PLT handling. (--unresolved-symbols=ignore-all suppresses undefined
symbol errors and serves as a convenience hack.)

This relies on an unguaranteed linker behavior: a statically-linked PIE
does not necessarily generate PLT entries.
While current lld generates a PLT entry, it will change to suppress the
PLT entry to simplify internal handling and improve consistency.

(The behavior has no consistency in GNU ld, some ports generated a
.dynsym entry while some don't. While most seem to generate a PLT entry
but some ports use a weird `R_*_NONE` relocation.)

(cherry picked from commit a907008bcb8dcc093f8aa5c0450d92cd63473b81)
---
 bolt/test/AArch64/exceptions-plt.cpp | 4 +++-
 bolt/test/AArch64/plt-call.test      | 4 +++-
 bolt/test/X86/callcont-fallthru.s    | 4 +++-
 bolt/test/X86/cfi-instrs-reordered.s | 4 +++-
 bolt/test/X86/plt-call.test          | 4 +++-
 bolt/test/runtime/exceptions-plt.cpp | 4 +++-
 bolt/test/runtime/plt-lld.test       | 7 ++++---
 7 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/bolt/test/AArch64/exceptions-plt.cpp b/bolt/test/AArch64/exceptions-plt.cpp
index 576f0fc91a9d8..33c28406ca8d6 100644
--- a/bolt/test/AArch64/exceptions-plt.cpp
+++ b/bolt/test/AArch64/exceptions-plt.cpp
@@ -2,7 +2,9 @@
 
 // REQUIRES: system-linux
 
-// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s -o %t.exe
+// RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
+// Link against a DSO to ensure PLT entries.
+// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s %t.so -o %t.exe
 // RUN: llvm-bolt %t.exe -o %t.bolt.exe --plt=all --print-only=.*main.* \
 // RUN:   --print-finalized 2>&1 | FileCheck %s
 
diff --git a/bolt/test/AArch64/plt-call.test b/bolt/test/AArch64/plt-call.test
index da307d4a6c01e..1fa62c4a36aaf 100644
--- a/bolt/test/AArch64/plt-call.test
+++ b/bolt/test/AArch64/plt-call.test
@@ -1,6 +1,8 @@
 // Verify that PLTCall optimization works.
 
-RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
+// Link against a DSO to ensure PLT entries.
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c %t.so \
 RUN:    -o %t -Wl,-q
 RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt  --print-only=foo | FileCheck %s
 
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index 31a7910d7fa3f..d76f869c971fd 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -1,7 +1,9 @@
 ## Ensures that a call continuation fallthrough count is set when using
 ## pre-aggregated perf data.
 
-# RUN: %clangxx %cxxflags %s -o %t -Wl,-q -nostdlib
+# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
+## Link against a DSO to ensure PLT entries.
+# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
 # RUN: link_fdata %s %t %t.pa1 PREAGG
 # RUN: link_fdata %s %t %t.pa2 PREAGG2
 # RUN: link_fdata %s %t %t.pa3 PREAGG3
diff --git a/bolt/test/X86/cfi-instrs-reordered.s b/bolt/test/X86/cfi-instrs-reordered.s
index c325aaf1ad8b1..5173fa6c3c7d0 100644
--- a/bolt/test/X86/cfi-instrs-reordered.s
+++ b/bolt/test/X86/cfi-instrs-reordered.s
@@ -3,7 +3,9 @@
 
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
 # RUN: llvm-strip --strip-unneeded %t.o
-# RUN: %clangxx %cflags %t.o -o %t.exe
+# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
+## Link against a DSO to ensure PLT entries.
+# RUN: %clangxx %cflags %t.o %t.so -o %t.exe
 # RUN: llvm-bolt %t.exe -o %t --reorder-blocks=cache --print-after-lowering \
 # RUN:   --print-only=_Z10SolveCubicddddPiPd 2>&1 | FileCheck %s
 #
diff --git a/bolt/test/X86/plt-call.test b/bolt/test/X86/plt-call.test
index e6ae86c179d27..aeee3024ac170 100644
--- a/bolt/test/X86/plt-call.test
+++ b/bolt/test/X86/plt-call.test
@@ -1,6 +1,8 @@
 // Verify that PLTCall optimization works.
 
-RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
+// Link against a DSO to ensure PLT entries.
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c %t.so \
 RUN:    -o %t -Wl,-q
 RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt  --print-only=foo | FileCheck %s
 
diff --git a/bolt/test/runtime/exceptions-plt.cpp b/bolt/test/runtime/exceptions-plt.cpp
index 8a75a3cb384b9..3d8e7a5133e2c 100644
--- a/bolt/test/runtime/exceptions-plt.cpp
+++ b/bolt/test/runtime/exceptions-plt.cpp
@@ -2,7 +2,9 @@
 
 // REQUIRES: system-linux
 
-// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s -o %t.exe
+// RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
+// Link against a DSO to ensure PLT entries.
+// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s %t.so -o %t.exe
 // RUN: llvm-bolt %t.exe -o %t.bolt.exe --plt=all
 // RUN: %t.bolt.exe
 
diff --git a/bolt/test/runtime/plt-lld.test b/bolt/test/runtime/plt-lld.test
index b505a191f90ab..3432e18bf4daf 100644
--- a/bolt/test/runtime/plt-lld.test
+++ b/bolt/test/runtime/plt-lld.test
@@ -1,14 +1,15 @@
 // This test checks that the pointers to PLT are properly updated.
-// The test is using lld linker.
+// The test uses lld and links against a DSO to ensure PLT entries.
+RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 
 // Non-PIE:
-RUN: %clang %cflags -no-pie %p/../Inputs/plt.c -fuse-ld=lld \
+RUN: %clang %cflags -no-pie %p/../Inputs/plt.c %t.so -fuse-ld=lld \
 RUN:    -o %t.lld.exe -Wl,-q
 RUN: llvm-bolt %t.lld.exe -o %t.lld.bolt.exe --use-old-text=0 --lite=0
 RUN: %t.lld.bolt.exe | FileCheck %s
 
 // PIE:
-RUN: %clang %cflags -fPIC -pie %p/../Inputs/plt.c -fuse-ld=lld \
+RUN: %clang %cflags -fPIC -pie %p/../Inputs/plt.c %t.so -fuse-ld=lld \
 RUN:    -o %t.lld.pie.exe -Wl,-q
 RUN: llvm-bolt %t.lld.pie.exe -o %t.lld.bolt.pie.exe --use-old-text=0 --lite=0
 RUN: %t.lld.bolt.pie.exe | FileCheck %s

From c03f46f2f0eac60ee407a6c645cfdb62e97fa77b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Tue, 11 Feb 2025 14:57:31 -0600
Subject: [PATCH 111/282] [Offload] Properly guard modifications to the RPC
 device array (#126790)

Summary:
If the user deallocates an RPC device this can sometimes fail if the RPC
server is still running. This will happen if the modification happens
while the server is still checking it. This patch adds a mutex to guard
modifications to it.

(cherry picked from commit baf7a3c1e561ff7e3f7da2261ce1012c4f2ba1c0)
---
 offload/plugins-nextgen/common/include/RPC.h | 12 +++++++++---
 offload/plugins-nextgen/common/src/RPC.cpp   |  5 ++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index 42fca4aa4aebc..08556f15a76bf 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -72,6 +72,9 @@ struct RPCServerTy {
   /// Array of associated devices. These must be alive as long as the server is.
   std::unique_ptr<plugin::GenericDeviceTy *[]> Devices;
 
+  /// Mutex that guards accesses to the buffers and device array.
+  std::mutex BufferMutex{};
+
   /// A helper class for running the user thread that handles the RPC interface.
   /// Because we only need to check the RPC server while any kernels are
   /// working, we track submission / completion events to allow the thread to
@@ -90,6 +93,9 @@ struct RPCServerTy {
     std::condition_variable CV;
     std::mutex Mutex;
 
+    /// A reference to the main server's mutex.
+    std::mutex &BufferMutex;
+
     /// A reference to all the RPC interfaces that the server is handling.
     llvm::ArrayRef<void *> Buffers;
 
@@ -98,9 +104,9 @@ struct RPCServerTy {
 
     /// Initialize the worker thread to run in the background.
     ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
-                 size_t Length)
-        : Running(false), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
-          Devices(Devices, Length) {}
+                 size_t Length, std::mutex &BufferMutex)
+        : Running(false), NumUsers(0), CV(), Mutex(), BufferMutex(BufferMutex),
+          Buffers(Buffers, Length), Devices(Devices, Length) {}
 
     ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
 
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index e6750a540b391..eb305736d6264 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -131,6 +131,7 @@ void RPCServerTy::ServerThread::run() {
     Lock.unlock();
     while (NumUsers.load(std::memory_order_relaxed) > 0 &&
            Running.load(std::memory_order_relaxed)) {
+      std::lock_guard<decltype(Mutex)> Lock(BufferMutex);
       for (const auto &[Buffer, Device] : llvm::zip_equal(Buffers, Devices)) {
         if (!Buffer || !Device)
           continue;
@@ -149,7 +150,7 @@ RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
       Devices(std::make_unique<plugin::GenericDeviceTy *[]>(
           Plugin.getNumDevices())),
       Thread(new ServerThread(Buffers.get(), Devices.get(),
-                              Plugin.getNumDevices())) {}
+                              Plugin.getNumDevices(), BufferMutex)) {}
 
 llvm::Error RPCServerTy::startThread() {
   Thread->startThread();
@@ -190,6 +191,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
   if (auto Err = Device.dataSubmit(ClientGlobal.getPtr(), &client,
                                    sizeof(rpc::Client), nullptr))
     return Err;
+  std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
   Buffers[Device.getDeviceId()] = RPCBuffer;
   Devices[Device.getDeviceId()] = &Device;
 
@@ -197,6 +199,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
 }
 
 Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
+  std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
   Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
   Buffers[Device.getDeviceId()] = nullptr;
   Devices[Device.getDeviceId()] = nullptr;

From 923d35bcf76529ba3afe736d160cb6be12b63e24 Mon Sep 17 00:00:00 2001
From: higher-performance <higher.performance.github@gmail.com>
Date: Tue, 11 Feb 2025 01:52:13 -0500
Subject: [PATCH 112/282] Fix false positive of
 [[clang::require_explicit_initialization]] on copy/move constructors
 (#126553)

Fixes #126490

(cherry picked from commit 90192e8872cc90b4d292b180a49babf72d17e579)
---
 clang/lib/Sema/SemaInit.cpp          |  4 +++-
 clang/test/SemaCXX/uninitialized.cpp | 34 +++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 450edcb52ae15..37796758960cd 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -4576,7 +4576,9 @@ static void TryConstructorInitialization(Sema &S,
     if (!IsListInit &&
         (Kind.getKind() == InitializationKind::IK_Default ||
          Kind.getKind() == InitializationKind::IK_Direct) &&
-        DestRecordDecl != nullptr && DestRecordDecl->isAggregate() &&
+        DestRecordDecl != nullptr &&
+        !(CtorDecl->isCopyOrMoveConstructor() && CtorDecl->isImplicit()) &&
+        DestRecordDecl->isAggregate() &&
         DestRecordDecl->hasUninitializedExplicitInitFields()) {
       S.Diag(Kind.getLocation(), diag::warn_field_requires_explicit_init)
           << /* Var-in-Record */ 1 << DestRecordDecl;
diff --git a/clang/test/SemaCXX/uninitialized.cpp b/clang/test/SemaCXX/uninitialized.cpp
index 7578b288d7b3f..4af2c998f082e 100644
--- a/clang/test/SemaCXX/uninitialized.cpp
+++ b/clang/test/SemaCXX/uninitialized.cpp
@@ -1542,9 +1542,15 @@ void aggregate() {
     };
   };
 
+  struct CopyAndMove {
+    CopyAndMove() = default;
+    CopyAndMove(const CopyAndMove &) {}
+    CopyAndMove(CopyAndMove &&) {}
+  };
   struct Embed {
     int embed1;  // #FIELD_EMBED1
     int embed2 [[clang::require_explicit_initialization]];  // #FIELD_EMBED2
+    CopyAndMove force_separate_move_ctor;
   };
   struct EmbedDerived : Embed {};
   struct F {
@@ -1582,7 +1588,33 @@ void aggregate() {
       F("___"),
       F("____")
   };
-  (void)ctors;
+
+  struct MoveOrCopy {
+    Embed e;
+    EmbedDerived ed;
+    F f;
+    // no-error
+    MoveOrCopy(const MoveOrCopy &c) : e(c.e), ed(c.ed), f(c.f) {}
+    // no-error
+    MoveOrCopy(MoveOrCopy &&c)
+        : e(std::move(c.e)), ed(std::move(c.ed)), f(std::move(c.f)) {}
+  };
+  F copy1(ctors[0]); // no-error
+  (void)copy1;
+  F move1(std::move(ctors[0])); // no-error
+  (void)move1;
+  F copy2{ctors[0]}; // no-error
+  (void)copy2;
+  F move2{std::move(ctors[0])}; // no-error
+  (void)move2;
+  F copy3 = ctors[0]; // no-error
+  (void)copy3;
+  F move3 = std::move(ctors[0]); // no-error
+  (void)move3;
+  F copy4 = {ctors[0]}; // no-error
+  (void)copy4;
+  F move4 = {std::move(ctors[0])}; // no-error
+  (void)move4;
 
   S::foo(S{1, 2, 3, 4});
   S::foo(S{.s1 = 100, .s4 = 100});

From ac97cff5a3684be98f4863191f0006cdf0fa89b4 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Tue, 11 Feb 2025 14:08:47 +0800
Subject: [PATCH 113/282] [C++20] [Modules] Don't diagnose duplicated
 declarations in different modules which is not in file scope

Close https://github.com/llvm/llvm-project/issues/126373

Although the root problems should be we shouldn't place the friend
declaration to the incorrect module, let's avoid bleeding the edge by
stoping diagnosing declarations not in file scope.

(cherry picked from commit 569e94f8f1c3e6998860e2b2ff577870433bdac9)
---
 clang/lib/Serialization/ASTReaderDecl.cpp |  7 +++++
 clang/test/Modules/pr126373.cppm          | 34 +++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 clang/test/Modules/pr126373.cppm

diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 1aa94d5a22abe..8fbb0a8d3edd8 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3751,6 +3751,13 @@ void ASTDeclReader::checkMultipleDefinitionInNamedModules(ASTReader &Reader,
   if (D->getFriendObjectKind() || Previous->getFriendObjectKind())
     return;
 
+  // Skip diagnosing in-class declarations.
+  if (!Previous->getLexicalDeclContext()
+           ->getNonTransparentContext()
+           ->isFileContext() ||
+      !D->getLexicalDeclContext()->getNonTransparentContext()->isFileContext())
+    return;
+
   Module *M = Previous->getOwningModule();
   if (!M)
     return;
diff --git a/clang/test/Modules/pr126373.cppm b/clang/test/Modules/pr126373.cppm
new file mode 100644
index 0000000000000..f176a587b51ce
--- /dev/null
+++ b/clang/test/Modules/pr126373.cppm
@@ -0,0 +1,34 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/module1.cppm -emit-module-interface -o %t/module1.pcm
+// RUN: %clang_cc1 -std=c++20 -fmodule-file=module1=%t/module1.pcm  %t/module2.cppm \
+// RUN:     -emit-module-interface -o %t/module2.pcm
+// RUN: %clang_cc1 -std=c++20 %t/module2.pcm -fmodule-file=module1=%t/module1.pcm \
+// RUN:     -emit-llvm -o - | FileCheck %t/module2.cppm
+
+//--- test.h
+template<typename T>
+struct Test {
+  template<typename U>
+  friend class Test;
+};
+
+//--- module1.cppm
+module;
+#include "test.h"
+export module module1;
+export void f1(Test<int>) {}
+
+//--- module2.cppm
+module;
+#include "test.h"
+export module module2;
+import module1;
+export void f2(Test<float>) {}
+
+extern "C" void func() {}
+
+// Fine enough to check the IR is emitted correctly.
+// CHECK: define{{.*}}@func

From 1c36697fbb554b49b00bd2e9bd842ffcb73d9a0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= <mikolaj.maciej.pirog@intel.com>
Date: Tue, 11 Feb 2025 06:13:36 +0100
Subject: [PATCH 114/282] [AVX10.2] Fix wrong mask casting in some convert
 intrinsics (#126627)

Found during work on #120927. This caused the compiler to silently drop
ignore half of the mask in the specific intrinsics.

(cherry picked from commit af522c5dd3a38cc5e11e8e62009d7dbe2cde2d86)
---
 clang/lib/Headers/avx10_2convertintrin.h         | 16 ++++++++--------
 clang/test/CodeGen/X86/avx10_2convert-builtins.c | 16 ++++++++--------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h
index c67a5b890f195..79d9def2207b8 100644
--- a/clang/lib/Headers/avx10_2convertintrin.h
+++ b/clang/lib/Headers/avx10_2convertintrin.h
@@ -260,13 +260,13 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_bf8(__m256h __A,
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvt2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W);
+      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B),
+      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B),
       (__v32qi)(__m256i)_mm256_setzero_si256());
 }
 
@@ -297,13 +297,13 @@ _mm256_cvts2ph_bf8(__m256h __A, __m256h __B) {
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvts2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)__W);
+      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvts2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B),
+      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B),
       (__v32qi)(__m256i)_mm256_setzero_si256());
 }
 
@@ -334,13 +334,13 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_hf8(__m256h __A,
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvt2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W);
+      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B),
+      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B),
       (__v32qi)(__m256i)_mm256_setzero_si256());
 }
 
@@ -371,13 +371,13 @@ _mm256_cvts2ph_hf8(__m256h __A, __m256h __B) {
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvts2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)__W);
+      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
   return (__m256i)__builtin_ia32_selectb_256(
-      (__mmask16)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B),
+      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B),
       (__v32qi)(__m256i)_mm256_setzero_si256());
 }
 
diff --git a/clang/test/CodeGen/X86/avx10_2convert-builtins.c b/clang/test/CodeGen/X86/avx10_2convert-builtins.c
index efd9a31c40875..e5e6f867e119e 100644
--- a/clang/test/CodeGen/X86/avx10_2convert-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2convert-builtins.c
@@ -231,7 +231,7 @@ __m256i test_mm256_cvt2ph_bf8(__m256h __A, __m256h __B) {
   return _mm256_cvt2ph_bf8(__A, __B);
 }
 
-__m256i test_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_mask_cvt2ph_bf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -239,7 +239,7 @@ __m256i test_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask16 __U, __m256h __A, __m2
   return _mm256_mask_cvt2ph_bf8(__W, __U, __A, __B);
 }
 
-__m256i test_mm256_maskz_cvt2ph_bf8(__mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_maskz_cvt2ph_bf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8256(
   // CHECK: zeroinitializer
@@ -275,7 +275,7 @@ __m256i test_mm256_cvts2ph_bf8(__m256h __A, __m256h __B) {
   return _mm256_cvts2ph_bf8(__A, __B);
 }
 
-__m256i test_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_mask_cvts2ph_bf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -283,7 +283,7 @@ __m256i test_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask16 __U, __m256h __A, __m
   return _mm256_mask_cvts2ph_bf8(__W, __U, __A, __B);
 }
 
-__m256i test_mm256_maskz_cvts2ph_bf8(__mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_maskz_cvts2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_maskz_cvts2ph_bf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2bf8s256(
   // CHECK: zeroinitializer
@@ -319,7 +319,7 @@ __m256i test_mm256_cvt2ph_hf8(__m256h __A, __m256h __B) {
   return _mm256_cvt2ph_hf8(__A, __B);
 }
 
-__m256i test_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_mask_cvt2ph_hf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -327,7 +327,7 @@ __m256i test_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask16 __U, __m256h __A, __m2
   return _mm256_mask_cvt2ph_hf8(__W, __U, __A, __B);
 }
 
-__m256i test_mm256_maskz_cvt2ph_hf8(__mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_maskz_cvt2ph_hf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8256(
   // CHECK: zeroinitializer
@@ -363,7 +363,7 @@ __m256i test_mm256_cvts2ph_hf8(__m256h __A, __m256h __B) {
   return _mm256_cvts2ph_hf8(__A, __B);
 }
 
-__m256i test_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_mask_cvts2ph_hf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -371,7 +371,7 @@ __m256i test_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask16 __U, __m256h __A, __m
   return _mm256_mask_cvts2ph_hf8(__W, __U, __A, __B);
 }
 
-__m256i test_mm256_maskz_cvts2ph_hf8(__mmask16 __U, __m256h __A, __m256h __B) {
+__m256i test_mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
   // CHECK-LABEL: @test_mm256_maskz_cvts2ph_hf8(
   // CHECK: call <32 x i8> @llvm.x86.avx10.vcvt2ph2hf8s256(
   // CHECK: zeroinitializer

From f33b128b3dc147a973cef55222549345b3201ad5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= <mikolaj.maciej.pirog@intel.com>
Date: Mon, 10 Feb 2025 05:48:02 +0100
Subject: [PATCH 115/282] [AVX10.2] Fix wrong intrinsic names after rename 
 (#126390)

In my previous PR (#123656) to update the names of AVX10.2 intrinsics
and mnemonics, I have erroneously deleted `_ph` from few intrinsics.
This PR corrects this.

(cherry picked from commit 161cfc6f39bef8994eb944687033ebd3570196e8)
---
 clang/lib/Headers/avx10_2_512convertintrin.h  |  6 ++--
 clang/lib/Headers/avx10_2convertintrin.h      | 17 +++++----
 .../CodeGen/X86/avx10_2_512convert-builtins.c | 18 +++++-----
 .../CodeGen/X86/avx10_2convert-builtins.c     | 36 +++++++++----------
 4 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/clang/lib/Headers/avx10_2_512convertintrin.h b/clang/lib/Headers/avx10_2_512convertintrin.h
index 0b5fca5cda522..516ccc68672d6 100644
--- a/clang/lib/Headers/avx10_2_512convertintrin.h
+++ b/clang/lib/Headers/avx10_2_512convertintrin.h
@@ -213,19 +213,19 @@ _mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
       (__v64qi)(__m512i)_mm512_setzero_si512());
 }
 
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8(__m256i __A) {
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8_ph(__m256i __A) {
   return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
       (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvthf8(__m512h __W, __mmask32 __U, __m256i __A) {
+_mm512_mask_cvthf8_ph(__m512h __W, __mmask32 __U, __m256i __A) {
   return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
       (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvthf8(__mmask32 __U, __m256i __A) {
+_mm512_maskz_cvthf8_ph(__mmask32 __U, __m256i __A) {
   return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
       (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U);
 }
diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h
index 79d9def2207b8..07722090c30ee 100644
--- a/clang/lib/Headers/avx10_2convertintrin.h
+++ b/clang/lib/Headers/avx10_2convertintrin.h
@@ -381,37 +381,36 @@ _mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
       (__v32qi)(__m256i)_mm256_setzero_si256());
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8(__m128i __A) {
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8_ph(__m128i __A) {
   return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
       (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1);
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvthf8(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128i __A) {
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvthf8_ph(__m128h __W, __mmask8 __U, __m128i __A) {
   return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
       (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U);
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvthf8(__mmask8 __U,
-                                                                 __m128i __A) {
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvthf8_ph(__mmask8 __U, __m128i __A) {
   return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
       (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U);
 }
 
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8(__m128i __A) {
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8_ph(__m128i __A) {
   return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
       (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvthf8(__m256h __W, __mmask16 __U, __m128i __A) {
+_mm256_mask_cvthf8_ph(__m256h __W, __mmask16 __U, __m128i __A) {
   return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
       (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvthf8(__mmask16 __U, __m128i __A) {
+_mm256_maskz_cvthf8_ph(__mmask16 __U, __m128i __A) {
   return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
       (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U);
 }
diff --git a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c
index 22503c640a727..dcf7bbc005a7c 100644
--- a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c
@@ -201,22 +201,22 @@ __m512i test_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
   return _mm512_maskz_cvts2ph_hf8(__U, __A, __B);
 }
 
-__m512h test_mm512_cvthf8(__m256i __A) {
-  // CHECK-LABEL: @test_mm512_cvthf8(
+__m512h test_mm512_cvthf8_ph(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvthf8_ph(
   // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(
-  return _mm512_cvthf8(__A);
+  return _mm512_cvthf8_ph(__A);
 }
 
-__m512h test_mm512_mask_cvthf8(__m512h __A, __mmask32 __B, __m256i __C) {
-  // CHECK-LABEL: @test_mm512_mask_cvthf8(
+__m512h test_mm512_mask_cvthf8_ph(__m512h __A, __mmask32 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm512_mask_cvthf8_ph(
   // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(
-  return _mm512_mask_cvthf8(__A, __B, __C);
+  return _mm512_mask_cvthf8_ph(__A, __B, __C);
 }
 
-__m512h test_mm512_maskz_cvthf8(__mmask32 __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_cvthf8(
+__m512h test_mm512_maskz_cvthf8_ph(__mmask32 __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_cvthf8_ph(
   // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(
-  return _mm512_maskz_cvthf8(__A, __B);
+  return _mm512_maskz_cvthf8_ph(__A, __B);
 }
 
 __m256i test_mm512_cvtph_bf8(__m512h __A) {
diff --git a/clang/test/CodeGen/X86/avx10_2convert-builtins.c b/clang/test/CodeGen/X86/avx10_2convert-builtins.c
index e5e6f867e119e..87fc6ffd7bc17 100644
--- a/clang/test/CodeGen/X86/avx10_2convert-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2convert-builtins.c
@@ -379,40 +379,40 @@ __m256i test_mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
   return _mm256_maskz_cvts2ph_hf8(__U, __A, __B);
 }
 
-__m128h test_mm_cvthf8(__m128i __A) {
-  // CHECK-LABEL: @test_mm_cvthf8(
+__m128h test_mm_cvthf8_ph(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvthf8_ph(
   // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(
-  return _mm_cvthf8(__A);
+  return _mm_cvthf8_ph(__A);
 }
 
-__m128h test_mm_mask_cvthf8(__m128h __A, __mmask8 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_mask_cvthf8(
+__m128h test_mm_mask_cvthf8_ph(__m128h __A, __mmask8 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_cvthf8_ph(
   // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(
-  return _mm_mask_cvthf8(__A, __B, __C);
+  return _mm_mask_cvthf8_ph(__A, __B, __C);
 }
 
-__m128h test_mm_maskz_cvthf8(__mmask8 __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm_maskz_cvthf8(
+__m128h test_mm_maskz_cvthf8_ph(__mmask8 __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvthf8_ph(
   // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(
-  return _mm_maskz_cvthf8(__A, __B);
+  return _mm_maskz_cvthf8_ph(__A, __B);
 }
 
-__m256h test_mm256_cvthf8(__m128i __A) {
-  // CHECK-LABEL: @test_mm256_cvthf8(
+__m256h test_mm256_cvthf8_ph(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_cvthf8_ph(
   // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(
-  return _mm256_cvthf8(__A);
+  return _mm256_cvthf8_ph(__A);
 }
 
-__m256h test_mm256_mask_cvthf8(__m256h __A, __mmask16 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm256_mask_cvthf8(
+__m256h test_mm256_mask_cvthf8_ph(__m256h __A, __mmask16 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm256_mask_cvthf8_ph(
   // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(
-  return _mm256_mask_cvthf8(__A, __B, __C);
+  return _mm256_mask_cvthf8_ph(__A, __B, __C);
 }
 
-__m256h test_mm256_maskz_cvthf8(__mmask16 __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm256_maskz_cvthf8(
+__m256h test_mm256_maskz_cvthf8_ph(__mmask16 __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_cvthf8_ph(
   // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(
-  return _mm256_maskz_cvthf8(__A, __B);
+  return _mm256_maskz_cvthf8_ph(__A, __B);
 }
 
 __m128i test_mm_cvtph_bf8(__m128h __A) {

From 94c1a8ea1bfea3cbd7191783c85b2cead642f653 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 10 Feb 2025 10:34:03 +0100
Subject: [PATCH 116/282] [DSE] Don't use initializes on byval argument
 (#126259)

There are two ways we can fix this problem, depending on how the
semantics of byval and initializes should interact:

* Don't infer initializes on byval arguments. initializes on byval
refers to the original caller memory (or having both attributes is made
a verifier error).
* Infer initializes on byval, but don't use it in DSE. initializes on
byval refers to the callee copy. This matches the semantics of readonly
on byval. This is slightly more powerful, for example, we could do a
backend optimization where byval + initializes will allocate the full
size of byval on the stack but not copy over the parts covered by
initializes.

I went with the second variant here, skipping byval + initializes in DSE
(FunctionAttrs already doesn't propagate initializes past byval). I'm
open to going in the other direction though.

Fixes https://github.com/llvm/llvm-project/issues/126181.

(cherry picked from commit 2d31a12dbe2339d20844ede70cbb54dbaf4ceea9)
---
 llvm/docs/LangRef.rst                              |  4 ++++
 .../lib/Transforms/Scalar/DeadStoreElimination.cpp |  4 +++-
 .../DeadStoreElimination/inter-procedural.ll       | 14 ++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index d004ced9dff14..e002195cb7ed5 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -1725,6 +1725,10 @@ Currently, only the following parameter attributes are defined:
     and negative values are allowed in case the argument points partway into
     an allocation. An empty list is not allowed.
 
+    On a ``byval`` argument, ``initializes`` refers to the given parts of the
+    callee copy being overwritten. A ``byval`` callee can never initialize the
+    original caller memory passed to the ``byval`` argument.
+
 ``dead_on_unwind``
     At a high level, this attribute indicates that the pointer argument is dead
     if the call unwinds, in the sense that the caller will not depend on the
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 13f3de07c3c44..0fdc3354753b1 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2281,7 +2281,9 @@ DSEState::getInitializesArgMemLoc(const Instruction *I) {
   for (unsigned Idx = 0, Count = CB->arg_size(); Idx < Count; ++Idx) {
     ConstantRangeList Inits;
     Attribute InitializesAttr = CB->getParamAttr(Idx, Attribute::Initializes);
-    if (InitializesAttr.isValid())
+    // initializes on byval arguments refers to the callee copy, not the
+    // original memory the caller passed in.
+    if (InitializesAttr.isValid() && !CB->isByValArgument(Idx))
       Inits = InitializesAttr.getValueAsConstantRangeList();
 
     Value *CurArg = CB->getArgOperand(Idx);
diff --git a/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll b/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll
index e590c5bf4004a..5f8ab56c22754 100644
--- a/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll
@@ -338,3 +338,17 @@ define i16 @global_var_alias() {
   ret i16 %l
 }
 
+declare void @byval_fn(ptr byval(i32) initializes((0, 4)) %am)
+
+define void @test_byval() {
+; CHECK-LABEL: @test_byval(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    call void @byval_fn(ptr [[A]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32
+  store i32 0, ptr %a
+  call void @byval_fn(ptr %a)
+  ret void
+}

From 9bbf3a98b793f8fc6269a20a026ca6fe029a1790 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 7 Feb 2025 12:41:06 +0100
Subject: [PATCH 117/282] [IndVars] Add test for #126012 (NFC)

(cherry picked from commit ae08969a2068dd327fbf4d0f606550574fbb9e45)
---
 .../Transforms/IndVarSimplify/pr126012.ll     | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/pr126012.ll

diff --git a/llvm/test/Transforms/IndVarSimplify/pr126012.ll b/llvm/test/Transforms/IndVarSimplify/pr126012.ll
new file mode 100644
index 0000000000000..725ea89b8e651
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/pr126012.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=indvars < %s | FileCheck %s
+
+; FIXME: This is a miscompile.
+define i32 @test() {
+; CHECK-LABEL: define i32 @test() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_PREHEADER:.*]]
+; CHECK:       [[FOR_PREHEADER]]:
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PHI:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[INDVAR3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[COND1:%.*]] = icmp eq i32 [[INDVAR3]], 0
+; CHECK-NEXT:    br i1 [[COND1]], label %[[FOR_INC]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 true to i32
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[PHI]] = phi i32 [ [[EXT]], %[[FOR_END]] ], [ 0, %[[FOR_PREHEADER]] ]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[INDVAR3]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR3]], 2
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_EXIT:.*]], label %[[FOR_PREHEADER]]
+; CHECK:       [[FOR_EXIT]]:
+; CHECK-NEXT:    [[INDVAR1_LCSSA:%.*]] = phi i32 [ [[INDVAR1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    ret i32 [[INDVAR1_LCSSA]]
+;
+entry:
+  br label %for.preheader
+
+for.preheader:
+  %indvar1 = phi i32 [ 0, %entry ], [ %phi, %for.inc ]
+  %indvar2 = phi i32 [ 1, %entry ], [ %indvar3, %for.inc ]
+  %indvar3 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cond1 = icmp eq i32 %indvar3, 0
+  br i1 %cond1, label %for.inc, label %for.end
+
+for.end:
+  %cmp = icmp sgt i32 %indvar2, 0
+  %ext = zext i1 %cmp to i32
+  br label %for.inc
+
+for.inc:
+  %phi = phi i32 [ %ext, %for.end ], [ 0, %for.preheader ]
+  %inc = add i32 %indvar3, 1
+  %exitcond = icmp eq i32 %indvar3, 2
+  br i1 %exitcond, label %for.exit, label %for.preheader
+
+for.exit:
+  ret i32 %indvar1
+}

From af970cd8753c37e7fcf66b6211f2a2d1e261325c Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 10 Feb 2025 10:07:21 +0100
Subject: [PATCH 118/282] [ScalarEvolution] Handle addrec incoming value in
 isImpliedViaMerge() (#126236)

The code already guards against values coming from a previous iteration
using properlyDominates(). However, addrecs are considered to properly
dominate the loop they are defined in.

Handle this special case separately, by checking for expressions that
have computable loop evolution (this should cover cases like a zext of
an addrec as well).

I considered changing the definition of properlyDominates() instead, but
decided against it. The current definition is useful in other context,
e.g. when deciding whether an expression is safe to expand in a given
block.

Fixes https://github.com/llvm/llvm-project/issues/126012.

(cherry picked from commit 7aed53eb1982113e825534f0f66d0a0e46e7a5ed)
---
 llvm/lib/Analysis/ScalarEvolution.cpp           |  6 ++++++
 llvm/test/Transforms/IndVarSimplify/pr126012.ll | 10 +++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 2ce40877b523e..c71202c8dd58e 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -12402,6 +12402,12 @@ bool ScalarEvolution::isImpliedViaMerge(CmpPredicate Pred, const SCEV *LHS,
       // iteration of a loop.
       if (!properlyDominates(L, LBB))
         return false;
+      // Addrecs are considered to properly dominate their loop, so are missed
+      // by the previous check. Discard any values that have computable
+      // evolution in this loop.
+      if (auto *Loop = LI.getLoopFor(LBB))
+        if (hasComputableLoopEvolution(L, Loop))
+          return false;
       if (!ProvedEasily(L, RHS))
         return false;
     }
diff --git a/llvm/test/Transforms/IndVarSimplify/pr126012.ll b/llvm/test/Transforms/IndVarSimplify/pr126012.ll
index 725ea89b8e651..5189fe020dd3b 100644
--- a/llvm/test/Transforms/IndVarSimplify/pr126012.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr126012.ll
@@ -1,18 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=indvars < %s | FileCheck %s
 
-; FIXME: This is a miscompile.
+; Do not infer that %cmp is true. The %indvar3 input of %indvar2 comes from
+; a previous iteration, so we should not compare it to a value from the current
+; iteration.
 define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[FOR_PREHEADER:.*]]
 ; CHECK:       [[FOR_PREHEADER]]:
 ; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PHI:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT:    [[INDVAR3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ [[INDVAR3:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[INDVAR3]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC]] ]
 ; CHECK-NEXT:    [[COND1:%.*]] = icmp eq i32 [[INDVAR3]], 0
 ; CHECK-NEXT:    br i1 [[COND1]], label %[[FOR_INC]], label %[[FOR_END:.*]]
 ; CHECK:       [[FOR_END]]:
-; CHECK-NEXT:    [[EXT:%.*]] = zext i1 true to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[INDVAR2]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
 ; CHECK-NEXT:    br label %[[FOR_INC]]
 ; CHECK:       [[FOR_INC]]:
 ; CHECK-NEXT:    [[PHI]] = phi i32 [ [[EXT]], %[[FOR_END]] ], [ 0, %[[FOR_PREHEADER]] ]

From a89e04e7f0caa28d607e38099b905063b47a88fb Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 3 Feb 2025 17:37:07 +0100
Subject: [PATCH 119/282] [ValueTracking] Add additional tests for
 computeKnownBits on GEPs (NFC)

These demonstrate miscompiles in the existing code.

(cherry picked from commit 3dc1ef1650c8389a6f195a474781cf2281208bed)
---
 llvm/unittests/Analysis/ValueTrackingTest.cpp | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index ee44aac45594d..39865fa195cf7 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -2679,6 +2679,41 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsAbsoluteSymbol) {
   EXPECT_EQ(0u, Known_0_256_Align8.countMinTrailingOnes());
 }
 
+TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPExtendBeforeMul) {
+  // FIXME: The index should be extended before multiplying with the scale.
+  parseAssembly(R"(
+    target datalayout = "p:16:16:16"
+
+    define void @test(i16 %arg) {
+      %and = and i16 %arg, u0x8000
+      %base = inttoptr i16 %and to ptr
+      %A = getelementptr i32, ptr %base, i8 80
+      ret void
+    }
+    )");
+  KnownBits Known = computeKnownBits(A, M->getDataLayout());
+  EXPECT_EQ(~64 & 0x7fff, Known.Zero);
+  EXPECT_EQ(64, Known.One);
+}
+
+TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPOnlyIndexBits) {
+  // FIXME: GEP should only affect the index width.
+  parseAssembly(R"(
+    target datalayout = "p:16:16:16:8"
+
+    define void @test(i16 %arg) {
+      %and = and i16 %arg, u0x8000
+      %or = or i16 %and, u0x00ff
+      %base = inttoptr i16 %or to ptr
+      %A = getelementptr i8, ptr %base, i8 1
+      ret void
+    }
+    )");
+  KnownBits Known = computeKnownBits(A, M->getDataLayout());
+  EXPECT_EQ(0x7eff, Known.Zero);
+  EXPECT_EQ(0x100, Known.One);
+}
+
 TEST_F(ValueTrackingTest, HaveNoCommonBitsSet) {
   {
     // Check for an inverted mask: (X & ~M) op (Y & M).

From 5777d5df62a659e165b4df74aefae29ae01d2509 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 4 Feb 2025 14:29:58 +0100
Subject: [PATCH 120/282] [ValueTracking] Fix bit width handling in
 computeKnownBits() for GEPs (#125532)

For GEPs, we have three bit widths involved: The pointer bit width, the
index bit width, and the bit width of the GEP operands.

The correct behavior here is:
* We need to sextOrTrunc the GEP operand to the index width *before*
multiplying by the scale.
* If the index width and pointer width differ, GEP only ever modifies
the low bits. Adds should not overflow into the high bits.

I'm testing this via unit tests because it's a bit tricky to test in IR
with InstCombine canonicalization getting in the way.

(cherry picked from commit 3bd11b502c1846afa5e1257c94b7a70566e34686)
---
 llvm/lib/Analysis/ValueTracking.cpp           | 66 ++++++++++---------
 llvm/unittests/Analysis/ValueTrackingTest.cpp | 12 ++--
 2 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index b63a0a07f7de2..8a674914641a8 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1445,7 +1445,22 @@ static void computeKnownBitsFromOperator(const Operator *I,
     computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
     // Accumulate the constant indices in a separate variable
     // to minimize the number of calls to computeForAddSub.
-    APInt AccConstIndices(BitWidth, 0, /*IsSigned*/ true);
+    unsigned IndexWidth = Q.DL.getIndexTypeSizeInBits(I->getType());
+    APInt AccConstIndices(IndexWidth, 0);
+
+    auto AddIndexToKnown = [&](KnownBits IndexBits) {
+      if (IndexWidth == BitWidth) {
+        // Note that inbounds does *not* guarantee nsw for the addition, as only
+        // the offset is signed, while the base address is unsigned.
+        Known = KnownBits::add(Known, IndexBits);
+      } else {
+        // If the index width is smaller than the pointer width, only add the
+        // value to the low bits.
+        assert(IndexWidth < BitWidth &&
+               "Index width can't be larger than pointer width");
+        Known.insertBits(KnownBits::add(Known.trunc(IndexWidth), IndexBits), 0);
+      }
+    };
 
     gep_type_iterator GTI = gep_type_begin(I);
     for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1483,43 +1498,34 @@ static void computeKnownBitsFromOperator(const Operator *I,
         break;
       }
 
-      unsigned IndexBitWidth = Index->getType()->getScalarSizeInBits();
-      KnownBits IndexBits(IndexBitWidth);
-      computeKnownBits(Index, IndexBits, Depth + 1, Q);
-      TypeSize IndexTypeSize = GTI.getSequentialElementStride(Q.DL);
-      uint64_t TypeSizeInBytes = IndexTypeSize.getKnownMinValue();
-      KnownBits ScalingFactor(IndexBitWidth);
+      TypeSize Stride = GTI.getSequentialElementStride(Q.DL);
+      uint64_t StrideInBytes = Stride.getKnownMinValue();
+      if (!Stride.isScalable()) {
+        // Fast path for constant offset.
+        if (auto *CI = dyn_cast<ConstantInt>(Index)) {
+          AccConstIndices +=
+              CI->getValue().sextOrTrunc(IndexWidth) * StrideInBytes;
+          continue;
+        }
+      }
+
+      KnownBits IndexBits =
+          computeKnownBits(Index, Depth + 1, Q).sextOrTrunc(IndexWidth);
+      KnownBits ScalingFactor(IndexWidth);
       // Multiply by current sizeof type.
       // &A[i] == A + i * sizeof(*A[i]).
-      if (IndexTypeSize.isScalable()) {
+      if (Stride.isScalable()) {
         // For scalable types the only thing we know about sizeof is
         // that this is a multiple of the minimum size.
-        ScalingFactor.Zero.setLowBits(llvm::countr_zero(TypeSizeInBytes));
-      } else if (IndexBits.isConstant()) {
-        APInt IndexConst = IndexBits.getConstant();
-        APInt ScalingFactor(IndexBitWidth, TypeSizeInBytes);
-        IndexConst *= ScalingFactor;
-        AccConstIndices += IndexConst.sextOrTrunc(BitWidth);
-        continue;
+        ScalingFactor.Zero.setLowBits(llvm::countr_zero(StrideInBytes));
       } else {
         ScalingFactor =
-            KnownBits::makeConstant(APInt(IndexBitWidth, TypeSizeInBytes));
+            KnownBits::makeConstant(APInt(IndexWidth, StrideInBytes));
       }
-      IndexBits = KnownBits::mul(IndexBits, ScalingFactor);
-
-      // If the offsets have a different width from the pointer, according
-      // to the language reference we need to sign-extend or truncate them
-      // to the width of the pointer.
-      IndexBits = IndexBits.sextOrTrunc(BitWidth);
-
-      // Note that inbounds does *not* guarantee nsw for the addition, as only
-      // the offset is signed, while the base address is unsigned.
-      Known = KnownBits::add(Known, IndexBits);
-    }
-    if (!Known.isUnknown() && !AccConstIndices.isZero()) {
-      KnownBits Index = KnownBits::makeConstant(AccConstIndices);
-      Known = KnownBits::add(Known, Index);
+      AddIndexToKnown(KnownBits::mul(IndexBits, ScalingFactor));
     }
+    if (!Known.isUnknown() && !AccConstIndices.isZero())
+      AddIndexToKnown(KnownBits::makeConstant(AccConstIndices));
     break;
   }
   case Instruction::PHI: {
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 39865fa195cf7..50e5e0e6b2ff5 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -2680,7 +2680,7 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsAbsoluteSymbol) {
 }
 
 TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPExtendBeforeMul) {
-  // FIXME: The index should be extended before multiplying with the scale.
+  // The index should be extended before multiplying with the scale.
   parseAssembly(R"(
     target datalayout = "p:16:16:16"
 
@@ -2692,12 +2692,12 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPExtendBeforeMul) {
     }
     )");
   KnownBits Known = computeKnownBits(A, M->getDataLayout());
-  EXPECT_EQ(~64 & 0x7fff, Known.Zero);
-  EXPECT_EQ(64, Known.One);
+  EXPECT_EQ(~320 & 0x7fff, Known.Zero);
+  EXPECT_EQ(320, Known.One);
 }
 
 TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPOnlyIndexBits) {
-  // FIXME: GEP should only affect the index width.
+  // GEP should only affect the index width.
   parseAssembly(R"(
     target datalayout = "p:16:16:16:8"
 
@@ -2710,8 +2710,8 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPOnlyIndexBits) {
     }
     )");
   KnownBits Known = computeKnownBits(A, M->getDataLayout());
-  EXPECT_EQ(0x7eff, Known.Zero);
-  EXPECT_EQ(0x100, Known.One);
+  EXPECT_EQ(0x7fff, Known.Zero);
+  EXPECT_EQ(0, Known.One);
 }
 
 TEST_F(ValueTrackingTest, HaveNoCommonBitsSet) {

From bc87f9b80946dfe651d953c2fb4967ea32277a34 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sat, 8 Feb 2025 23:22:33 -0800
Subject: [PATCH 121/282] [clang-format] Handle C-style cast of member function
 pointer type (#126340)

Fixes #125012.

(cherry picked from commit 8d373ceaec1f1b27c9e682cfaf71aae19ea48d98)
---
 clang/lib/Format/TokenAnnotator.cpp           | 7 +++++--
 clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index a172df5291ae6..4246ade6e19be 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -477,8 +477,9 @@ class AnnotatingParser {
     FormatToken *PossibleObjCForInToken = nullptr;
     while (CurrentToken) {
       const auto &Prev = *CurrentToken->Previous;
+      const auto *PrevPrev = Prev.Previous;
       if (Prev.is(TT_PointerOrReference) &&
-          Prev.Previous->isOneOf(tok::l_paren, tok::coloncolon)) {
+          PrevPrev->isOneOf(tok::l_paren, tok::coloncolon)) {
         ProbablyFunctionType = true;
       }
       if (CurrentToken->is(tok::comma))
@@ -486,8 +487,10 @@ class AnnotatingParser {
       if (Prev.is(TT_BinaryOperator))
         Contexts.back().IsExpression = true;
       if (CurrentToken->is(tok::r_paren)) {
-        if (Prev.is(TT_PointerOrReference) && Prev.Previous == &OpeningParen)
+        if (Prev.is(TT_PointerOrReference) &&
+            (PrevPrev == &OpeningParen || PrevPrev->is(tok::coloncolon))) {
           MightBeFunctionType = true;
+        }
         if (OpeningParen.isNot(TT_CppCastLParen) && MightBeFunctionType &&
             ProbablyFunctionType && CurrentToken->Next &&
             (CurrentToken->Next->is(tok::l_paren) ||
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index fc77e277947c5..2147a1b950dd1 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -845,6 +845,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) {
   EXPECT_TOKEN(Tokens[14], tok::r_paren, TT_CastRParen);
   EXPECT_TOKEN(Tokens[15], tok::amp, TT_UnaryOperator);
 
+  Tokens = annotate("return (Foo (Bar::*)())&Bar::foo;");
+  ASSERT_EQ(Tokens.size(), 17u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::l_paren, TT_FunctionTypeLParen);
+  EXPECT_TOKEN(Tokens[10], tok::r_paren, TT_CastRParen);
+  EXPECT_TOKEN(Tokens[11], tok::amp, TT_UnaryOperator);
+
   auto Style = getLLVMStyle();
   Style.TypeNames.push_back("Foo");
   Tokens = annotate("#define FOO(bar) foo((Foo)&bar)", Style);

From d43a97163c43d3cfbfc7c11287aea2233bc7ffb4 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Tue, 11 Feb 2025 23:22:01 +0100
Subject: [PATCH 122/282] release/20.x: [llvm-objcopy][ReleaseNotes] Fix prints
 wrong path when dump-section output path doesn't exist #125345 (#126607)

Add release note for llvm-objcopy fixing prints wrong path when
dump-section output path doesn't exist in #125345
---
 llvm/docs/ReleaseNotes.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 44a0b17d6a07b..28908490b8f7c 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -460,6 +460,8 @@ Changes to the LLVM tools
   `--localize-symbol`, `--localize-symbols`,
   `--skip-symbol`, `--skip-symbols`.
 
+* llvm-objcopy now prints the correct file path in the error message when the output file specified by `--dump-section` cannot be opened.
+
 Changes to LLDB
 ---------------------------------
 

From cb51906de1e394b458e3ed9dd06cd9a0b732eb68 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 11 Feb 2025 13:03:12 +0100
Subject: [PATCH 123/282] [VPlan] Only skip expansion for SCEVUnknown if it
 isn't an instruction. (#125235)

Update getOrCreateVPValueForSCEVExpr to only skip expansion of
SCEVUnknown if the underlying value isn't an instruction. Instructions
may be defined in a loop and using them without expansion may break
LCSSA form. SCEVExpander will take care of preserving LCSSA if needed.

We could also try to pass LoopInfo, but there are some users of the
function where it won't be available and main benefit from skipping
expansion is slightly more concise VPlans.

Note that SCEVExpander is now used to expand SCEVUnknown with floats.
Adjust the check in expandCodeFor to only check the types and casts if
the type of the value is different to the requested type. Otherwise we
crash when trying to expand a float and requesting a float type.

Fixes https://github.com/llvm/llvm-project/issues/121518.

PR: https://github.com/llvm/llvm-project/pull/125235
(cherry picked from commit e258bca9505f35e0a22cb213a305eea9b76d11ea)
---
 .../Utils/ScalarEvolutionExpander.cpp         |   2 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   2 +-
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |  15 ++-
 .../X86/scev-checks-unprofitable.ll           | 110 ++++++++++++++++++
 .../LoopVectorize/float-induction.ll          |   6 +-
 5 files changed, 127 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll

diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3a761bc4e8119..d429fe96f9bec 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1451,7 +1451,7 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) {
   // Expand the code for this SCEV.
   Value *V = expand(SH);
 
-  if (Ty) {
+  if (Ty && Ty != V->getType()) {
     assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
            "non-trivial casts should be done with the SCEVs directly!");
     V = InsertNoopCastOfTo(V, Ty);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2679ed6b26b5d..6b746a48c46ec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3324,7 +3324,7 @@ void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   }
 
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
-  SCEVExpander Exp(SE, DL, "induction");
+  SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);
 
   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                  &*State.Builder.GetInsertPoint());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index e40af3e2e3d30..1a7322ec0aff6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -30,11 +30,18 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   VPValue *Expanded = nullptr;
   if (auto *E = dyn_cast<SCEVConstant>(Expr))
     Expanded = Plan.getOrAddLiveIn(E->getValue());
-  else if (auto *E = dyn_cast<SCEVUnknown>(Expr))
-    Expanded = Plan.getOrAddLiveIn(E->getValue());
   else {
-    Expanded = new VPExpandSCEVRecipe(Expr, SE);
-    Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());
+    auto *U = dyn_cast<SCEVUnknown>(Expr);
+    // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction
+    // value. Otherwise the value may be defined in a loop and using it directly
+    // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA
+    // form.
+    if (U && !isa<Instruction>(U->getValue())) {
+      Expanded = Plan.getOrAddLiveIn(U->getValue());
+    } else {
+      Expanded = new VPExpandSCEVRecipe(Expr, SE);
+      Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());
+    }
   }
   Plan.addSCEVExpansion(Expr, Expanded);
   return Expanded;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
new file mode 100644
index 0000000000000..868e07aa0490e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -vectorizer-min-trip-count=8  -mcpu=skylake-avx512 -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test case for https://github.com/llvm/llvm-project/issues/121518. Make sure
+; that we preserve LCSSA form when using %iv.1 from loop.1 in the trip count
+; expression when vectorizing loop.2
+define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr %dst) {
+; CHECK-LABEL: define void @value_defined_in_loop1_used_for_trip_counts(
+; CHECK-SAME: i32 [[START:%.*]], i1 [[C:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[C]], i32 0, i32 7
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
+; CHECK-NEXT:    br label %[[LOOP_1:.*]]
+; CHECK:       [[LOOP_1]]:
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ZEXT]], %[[LOOP_1]] ]
+; CHECK-NEXT:    br i1 false, label %[[LOOP_1_EXIT:.*]], label %[[LOOP_1]]
+; CHECK:       [[LOOP_1_EXIT]]:
+; CHECK-NEXT:    [[IV_1_LCSSA2:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_3_PREHEADER:.*]]
+; CHECK:       [[LOOP_3_PREHEADER]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[IV_1_LCSSA2]], 15
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1_LCSSA2]], 1
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr [[TMP2]], i32 1, <16 x i1> [[TMP0]])
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT_1_LOOPEXIT1:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_3_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[LOOP_3:.*]]
+; CHECK:       [[LOOP_2_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP_2:.*]]
+; CHECK:       [[LOOP_2]]:
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ], [ 0, %[[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT:    [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], %[[LOOP_2]] ], [ [[START]], %[[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT:    [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], 1
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[IV_3]], 1
+; CHECK-NEXT:    [[ZEXT8:%.*]] = zext i32 [[SHL]] to i64
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[ZEXT8]]
+; CHECK-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[EC_2:%.*]] = icmp ult i64 [[IV_2]], [[IV_1_LCSSA]]
+; CHECK-NEXT:    br i1 [[EC_2]], label %[[LOOP_2]], label %[[EXIT_1_LOOPEXIT:.*]]
+; CHECK:       [[LOOP_3]]:
+; CHECK-NEXT:    [[IV_4:%.*]] = phi i64 [ [[IV_4_NEXT:%.*]], %[[LOOP_3]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP_DST_2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_4]]
+; CHECK-NEXT:    store i8 0, ptr [[GEP_DST_2]], align 1
+; CHECK-NEXT:    [[IV_4_NEXT]] = add i64 [[IV_4]], 1
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ult i64 [[IV_4_NEXT]], [[IV_1_LCSSA]]
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP_3]], label %[[EXIT_1_LOOPEXIT1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_1_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[EXIT_1:.*]]
+; CHECK:       [[EXIT_1_LOOPEXIT1]]:
+; CHECK-NEXT:    br label %[[EXIT_1]]
+; CHECK:       [[EXIT_1]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %select = select i1 %c, i32 0, i32 7
+  %zext = zext i32 %select to i64
+  br label %loop.1
+
+loop.1:
+  %iv.1 = phi i64 [ 0, %entry ], [ %zext, %loop.1 ]
+  br i1 false, label %loop.1.exit, label %loop.1
+
+loop.1.exit:
+  br i1 %c, label %loop.2, label %loop.3
+
+loop.2:
+  %iv.2 = phi i64 [ 0, %loop.1.exit ], [ %iv.2.next, %loop.2 ]
+  %iv.3 = phi i32 [ %start, %loop.1.exit ], [ %iv.3.next, %loop.2 ]
+  %iv.3.next = add i32 %iv.3, 1
+  %iv.2.next = add i64 %iv.2, 1
+  %shl = shl i32 %iv.3, 1
+  %zext8 = zext i32 %shl to i64
+  %gep.dst = getelementptr i8, ptr %dst, i64 %zext8
+  store i16 0, ptr %gep.dst, align 2
+  %ec.2 = icmp ult i64 %iv.2, %iv.1
+  br i1 %ec.2, label %loop.2, label %exit.1
+
+loop.3:
+  %iv.4 = phi i64 [ 0, %loop.1.exit ], [ %iv.4.next, %loop.3 ]
+  %gep.dst.2 = getelementptr i8, ptr %dst, i64 %iv.4
+  store i8 0, ptr %gep.dst.2, align 1
+  %iv.4.next = add i64 %iv.4, 1
+  %ec.3 = icmp ult i64 %iv.4.next, %iv.1
+  br i1 %ec.3, label %loop.3, label %exit.1
+
+exit.1:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index 601c475df56cd..29b29c500c46e 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -842,14 +842,16 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
 ; VEC4_INTERL2-NEXT:    [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]]
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
-; VEC4_INTERL2-NEXT:    [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
-; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST]], splat (float 4.000000e+00)
+; VEC4_INTERL2-NEXT:    [[TMP19:%.*]] = fmul fast <4 x float> [[DOTSPLATINSERT2]], <float 4.000000e+00, float poison, float poison, float poison>
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; VEC4_INTERL2-NEXT:    [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]]
+; VEC4_INTERL2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; VEC4_INTERL2-NEXT:    [[BROADCAST:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT7]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL2:       vector.body:
 ; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]

From 203cd187957eda87cfc576ccabd2aabf62d34bc9 Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vigneshwar.jayakumar@amd.com>
Date: Tue, 11 Feb 2025 09:30:16 -0600
Subject: [PATCH 124/282] AMDGPU: Handle gfx950 XDL-write-VGPR-Overlap-Src-AB
 wait state (#126732)

gfx950 needs more additional waitstates from gfx940

(cherry picked from commit c837f572865eb2980b82a8415da45dc1157627bf)
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 16 ++---
 .../CodeGen/AMDGPU/mai-hazards-gfx940.mir     | 60 ++++++++++++-------
 .../AMDGPU/mai-hazards-mfma-scale.gfx950.mir  |  4 +-
 3 files changed, 51 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 873d18e30a430..844441308275f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2297,12 +2297,14 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
   return NumPasses + 2;
 }
 
-static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
-  // 2 pass -> 5
-  // 4 pass -> 7
-  // 8 pass -> 11
-  // 16 pass -> 19
-  return NumPasses + 3;
+static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
+                                                                bool IsGFX950) {
+  // xdl def cycles | gfx940 | gfx950
+  // 2 pass         |  5        5
+  // 4 pass         |  7        8
+  // 8 pass         |  11       12
+  // 16 pass        |  19       20
+  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
 }
 
 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
@@ -2471,7 +2473,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
           NeedWaitStates =
               isXDL(ST, *MI1)
                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
-                        NumPasses)
+                        NumPasses, ST.hasGFX950Insts())
                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                         NumPasses);
           break;
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 52891989b88fb..1eb7ec4c142f2 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -417,7 +417,8 @@ body:             |
 # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -439,7 +440,8 @@ body:             |
 # GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
 # GCN:      V_SMFMAC
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            smfmac32x32_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -450,7 +452,8 @@ body:             |
 # GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
 # GCN:      V_SMFMAC
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_SMFMAC
 name:            smfmac32x32_write_agpr_smfmac_srcc_read_overlap
 body:             |
@@ -462,7 +465,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -1715,7 +1719,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap
 body:             |
@@ -1725,7 +1730,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap
 body:             |
@@ -1735,7 +1741,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap
 body:             |
@@ -1826,7 +1833,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_SMFMAC
 name:            smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx
 body:             |
@@ -2188,7 +2196,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2202,7 +2211,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |
@@ -2276,7 +2286,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2290,7 +2301,8 @@ body:             |
 # 4 pass source
 # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2321,7 +2333,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2336,7 +2349,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2370,7 +2384,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
 body:             |
@@ -2386,7 +2401,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
 body:             |
@@ -2456,7 +2472,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2470,7 +2487,8 @@ body:             |
 # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |
@@ -2502,7 +2520,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca
 body:             |
@@ -2519,7 +2538,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MFMA
 name:            xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
index 433236180b137..4585eca8fe894 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
@@ -254,7 +254,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_NOP 7
-    ; GCN-NEXT: S_NOP 2
+    ; GCN-NEXT: S_NOP 3
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
@@ -275,7 +275,7 @@ body:             |
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_NOP 6
+    ; GCN-NEXT: S_NOP 7
     ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
     ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec

From f874fac4858d3610f3d847e5c1ec098059a7b530 Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vigneshwar.jayakumar@amd.com>
Date: Tue, 11 Feb 2025 12:30:53 -0600
Subject: [PATCH 125/282] AMDGPU: Handle gfx950 XDL-write-VGPR-VALU-Mem-Exp
 wait state change (#126727)

(cherry picked from commit a2263eba4d3be0daa96bd154de3b8f2a67aa67fb)
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  17 +-
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |  12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  72 +++---
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 190 +++++++-------
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 234 +++++++++---------
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 202 +++++++--------
 .../CodeGen/AMDGPU/mai-hazards-gfx940.mir     |  36 ++-
 7 files changed, 389 insertions(+), 374 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 844441308275f..537181710ed32 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2613,12 +2613,14 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
   return NumPasses + 3;
 }
 
-static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
-  // 2 pass -> 5
-  // 4 pass -> 7
-  // 8 pass -> 11
-  // 16 pass -> 19
-  return NumPasses + 3;
+static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
+                                                              bool IsGFX950) {
+  // xdl def cycles | gfx940 | gfx950
+  // 2 pass         |  5        5
+  // 4 pass         |  7        8
+  // 8 pass         |  11       12
+  // 16 pass        |  19       20
+  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
 }
 
 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
@@ -2769,7 +2771,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
       } else if (ST.hasGFX940Insts()) {
         NeedWaitStates =
             isXDL(ST, *MFMA)
-                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
+                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
+                      NumPasses, ST.hasGFX950Insts())
                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                       NumPasses);
       } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 8d380516df8b5..452033f332659 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
 ; GCN-NEXT:    v_mov_b32_e32 v9, s17
 ; GCN-NEXT:    v_mov_b32_e32 v10, s18
 ; GCN-NEXT:    v_mov_b32_e32 v11, s19
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 4
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
 ; GCN-NEXT:    v_mov_b32_e32 v9, s17
 ; GCN-NEXT:    v_mov_b32_e32 v10, s18
 ; GCN-NEXT:    v_mov_b32_e32 v11, s19
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 4
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -417,7 +417,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -459,7 +459,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 44cb4e803ffad..4628a9c15391b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -19,7 +19,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -39,7 +39,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -67,7 +67,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
@@ -114,7 +114,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -135,7 +135,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
@@ -186,7 +186,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 4
 ; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -253,7 +253,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -316,7 +316,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 4
 ; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -383,7 +383,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -430,7 +430,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -475,7 +475,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -776,7 +776,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -813,7 +813,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -855,7 +855,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -892,7 +892,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -919,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -939,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -971,7 +971,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -992,7 +992,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1043,7 +1043,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1)
@@ -1097,7 +1097,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
@@ -1169,7 +1169,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -1233,7 +1233,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
@@ -1305,7 +1305,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -1352,7 +1352,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1397,7 +1397,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -1754,7 +1754,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -1801,7 +1801,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -1838,7 +1838,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -1865,7 +1865,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1913,7 +1913,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; GCN-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
@@ -1939,7 +1939,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; GCN-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 9a8282231ac15..25b857f8f47dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -24,7 +24,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -48,7 +48,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -72,7 +72,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -96,7 +96,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -120,7 +120,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -144,7 +144,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -168,7 +168,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -192,7 +192,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -217,7 +217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -242,7 +242,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -267,7 +267,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -292,7 +292,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -317,7 +317,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -342,7 +342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -367,7 +367,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -392,7 +392,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -417,7 +417,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -442,7 +442,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -467,7 +467,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -492,7 +492,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -518,7 +518,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -543,7 +543,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -567,7 +567,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -592,7 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -617,7 +617,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -642,7 +642,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -667,7 +667,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -692,7 +692,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -717,7 +717,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -742,7 +742,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -767,7 +767,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -791,7 +791,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -815,7 +815,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -839,7 +839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -863,7 +863,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -889,7 +889,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -914,7 +914,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -939,7 +939,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -964,7 +964,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -988,7 +988,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1012,7 +1012,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1036,7 +1036,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1060,7 +1060,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1084,7 +1084,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1108,7 +1108,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1132,7 +1132,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1156,7 +1156,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1181,7 +1181,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1206,7 +1206,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1231,7 +1231,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1256,7 +1256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1280,7 +1280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1304,7 +1304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1328,7 +1328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1352,7 +1352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1376,7 +1376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1400,7 +1400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1429,7 +1429,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1450,7 +1450,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1471,7 +1471,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1512,7 +1512,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1545,7 +1545,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1574,7 +1574,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1599,7 +1599,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1628,7 +1628,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1653,7 +1653,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1682,7 +1682,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1707,7 +1707,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1728,7 +1728,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1757,7 +1757,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1782,7 +1782,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1803,7 +1803,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1825,7 +1825,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1843,7 +1843,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1866,7 +1866,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1927,7 +1927,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    global_store_dwordx4 v16, a[0:3], s[14:15]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1953,7 +1953,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[30:31]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
@@ -1993,7 +1993,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    global_store_dwordx4 v16, a[0:3], s[4:5]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2020,7 +2020,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[4:5]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
@@ -2040,7 +2040,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2062,7 +2062,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2083,7 +2083,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2104,7 +2104,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2129,7 +2129,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2153,7 +2153,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2176,7 +2176,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2199,7 +2199,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2223,7 +2223,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2247,7 +2247,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2271,7 +2271,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2295,7 +2295,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2318,7 +2318,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2341,7 +2341,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2365,4 +2365,4 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8
 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
 
 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
-attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 05f8739e7cb89..3d959393a8fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -40,7 +40,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -85,7 +85,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -137,7 +137,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -182,7 +182,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -234,7 +234,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -279,7 +279,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -331,7 +331,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -376,7 +376,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -428,7 +428,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -473,7 +473,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -525,7 +525,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -570,7 +570,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -622,7 +622,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -667,7 +667,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -719,7 +719,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -764,7 +764,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -815,7 +815,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -868,7 +868,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -913,7 +913,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -963,7 +963,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1015,7 +1015,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1064,7 +1064,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1116,7 +1116,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1165,7 +1165,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1215,7 +1215,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1264,7 +1264,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1317,7 +1317,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1362,7 +1362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1412,7 +1412,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1465,7 +1465,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1510,7 +1510,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1561,7 +1561,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1613,7 +1613,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1662,7 +1662,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1714,7 +1714,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1763,7 +1763,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1813,7 +1813,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1862,7 +1862,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1914,7 +1914,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1963,7 +1963,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2015,7 +2015,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2064,7 +2064,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2113,7 +2113,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2161,7 +2161,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2210,7 +2210,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2258,7 +2258,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2311,7 +2311,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2360,7 +2360,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2412,7 +2412,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2461,7 +2461,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2510,7 +2510,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2558,7 +2558,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2607,7 +2607,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2655,7 +2655,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2704,7 +2704,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2752,7 +2752,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2801,7 +2801,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2849,7 +2849,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2899,7 +2899,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2948,7 +2948,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2998,7 +2998,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3047,7 +3047,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3096,7 +3096,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3144,7 +3144,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3193,7 +3193,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3241,7 +3241,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3290,7 +3290,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3338,7 +3338,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3393,7 +3393,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3441,7 +3441,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3489,7 +3489,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3567,7 +3567,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3637,7 +3637,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3691,7 +3691,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3741,7 +3741,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3795,7 +3795,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3845,7 +3845,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3899,7 +3899,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3949,7 +3949,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3995,7 +3995,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4041,7 +4041,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4111,7 +4111,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4177,7 +4177,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4224,7 +4224,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4272,7 +4272,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4316,7 +4316,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4365,7 +4365,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4410,7 +4410,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4477,7 +4477,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
@@ -4520,7 +4520,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[2:3]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
@@ -4576,7 +4576,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -4619,7 +4619,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -4765,7 +4765,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
@@ -4912,7 +4912,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
@@ -5059,7 +5059,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
@@ -5206,7 +5206,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
@@ -5247,7 +5247,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5294,7 +5294,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5341,7 +5341,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5388,7 +5388,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5441,7 +5441,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5486,7 +5486,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5538,7 +5538,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5583,7 +5583,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5634,7 +5634,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5678,7 +5678,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5727,7 +5727,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5779,7 +5779,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5824,7 +5824,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5876,7 +5876,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5921,7 +5921,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -5972,7 +5972,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6023,7 +6023,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(
 ; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6074,7 +6074,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6118,7 +6118,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 2
+; GISEL-NEXT:    s_nop 3
 ; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6167,7 +6167,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -6204,4 +6204,4 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8
 
 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
 attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
-attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 66c02a9bd0c6a..6b922fcd9b550 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -59,7 +59,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
 ; GISEL-NEXT:    s_endpgm
 bb:
@@ -81,7 +81,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -92,7 +92,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -112,7 +112,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -123,7 +123,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -143,7 +143,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -154,7 +154,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -187,7 +187,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
 ; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -246,7 +246,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -279,7 +279,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
@@ -317,7 +317,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -389,7 +389,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -461,7 +461,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -561,7 +561,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -650,7 +650,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; GCN-NEXT:    s_endpgm
 bb:
@@ -672,7 +672,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -692,7 +692,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -712,7 +712,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -745,7 +745,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
 ; GCN-NEXT:    v_mov_b32_e32 v12, s28
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -788,7 +788,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -826,7 +826,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -871,7 +871,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -916,7 +916,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -989,7 +989,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1046,7 +1046,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1072,7 +1072,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GISEL-NEXT:    s_endpgm
 bb:
@@ -1094,7 +1094,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1105,7 +1105,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1125,7 +1125,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1136,7 +1136,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1156,7 +1156,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1167,7 +1167,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1200,7 +1200,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1265,7 +1265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
@@ -1298,7 +1298,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -1336,7 +1336,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1408,7 +1408,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1480,7 +1480,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1580,7 +1580,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1675,7 +1675,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1701,7 +1701,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GISEL-NEXT:    s_endpgm
 bb:
@@ -1723,7 +1723,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1734,7 +1734,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1754,7 +1754,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1765,7 +1765,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1785,7 +1785,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1796,7 +1796,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1829,7 +1829,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1890,7 +1890,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1916,7 +1916,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GISEL-NEXT:    s_endpgm
 bb:
@@ -1938,7 +1938,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1949,7 +1949,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -1969,7 +1969,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1980,7 +1980,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2000,7 +2000,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2011,7 +2011,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2044,7 +2044,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2105,7 +2105,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2131,7 +2131,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GISEL-NEXT:    s_endpgm
 bb:
@@ -2153,7 +2153,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2164,7 +2164,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2184,7 +2184,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2195,7 +2195,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2215,7 +2215,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2226,7 +2226,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2259,7 +2259,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2320,7 +2320,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -2346,7 +2346,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GISEL-NEXT:    s_endpgm
 bb:
@@ -2368,7 +2368,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2379,7 +2379,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2399,7 +2399,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2410,7 +2410,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2430,7 +2430,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2441,7 +2441,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v13
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v14
@@ -2474,7 +2474,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2539,7 +2539,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
@@ -2572,7 +2572,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -2610,7 +2610,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2682,7 +2682,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2754,7 +2754,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2854,7 +2854,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -2953,7 +2953,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
@@ -2986,7 +2986,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -3024,7 +3024,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3096,7 +3096,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3168,7 +3168,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3268,7 +3268,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3367,7 +3367,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
@@ -3400,7 +3400,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -3438,7 +3438,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3510,7 +3510,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3582,7 +3582,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3682,7 +3682,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3781,7 +3781,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
@@ -3814,7 +3814,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
 ; GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@@ -3852,7 +3852,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3924,7 +3924,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -3996,7 +3996,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4096,7 +4096,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    s_nop 3
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
 ; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -4156,4 +4156,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
   ret <16 x float> %result
 }
 
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index 1eb7ec4c142f2..ef30c9a44b2b5 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -734,7 +734,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac16x16_write_vgpr_flat_read
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            smfmac16x16_write_vgpr_flat_read
 body:             |
@@ -745,7 +746,8 @@ body:             |
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            xdl_smfma16x16_write_vgpr_flat_read
 body:             |
@@ -756,7 +758,8 @@ body:             |
 # GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read
 # GCN:      V_SMFMAC
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            smfmac32x32_write_vgpr_flat_read
 body:             |
@@ -768,7 +771,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: FLAT_STORE_DWORD
 name:            xdl_smfma32x32_write_vgpr_flat_read
 body:             |
@@ -823,7 +827,8 @@ body:             |
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
 name:            xdl_smfma16x16_write_vgpr_valu_read
 body:             |
@@ -835,7 +840,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
 name:            xdl_smfma32x32_write_vgpr_valu_read
 body:             |
@@ -881,7 +887,8 @@ body:             |
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            xdl_smfma16x16_write_vgpr_accv_read
 body:             |
@@ -893,7 +900,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
 name:            xdl_smfma32x32_write_vgpr_accv_read
 body:             |
@@ -1028,7 +1036,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32_sdwa
 name:            xdl_smfma32x32_write_vgpr_valu_sdwa_write
 body:             |
@@ -1762,7 +1771,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_vm_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: BUFFER_STORE_DWORD
 name:            xdl_sgemm16X16X16_mfma_write_vgpr_vm_read
 body:             |
@@ -1772,7 +1782,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MOV_B32
 name:            xdl_sgemm16X16X16_mfma_write_vgpr_valu_read
 body:             |
@@ -1782,7 +1793,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dot_read
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_DOT
 name:            xdl_sgemm16X16X16_mfma_write_vgpr_dot_read
 body:             |

From 95354f3ea45a831783b86d8459e90ce8f69216f5 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sat, 1 Feb 2025 20:41:15 +0800
Subject: [PATCH 126/282] [InstCombine] Check nowrap flags when folding
 comparison of GEPs with the same base pointer (#121892)

Alive2: https://alive2.llvm.org/ce/z/P5XbMx
Closes https://github.com/llvm/llvm-project/issues/121890

TODO: It is still safe to perform this transform without nowrap flags if
the corresponding scale factor is 1 byte:
https://alive2.llvm.org/ce/z/J-JCJd

(cherry picked from commit 9725595f3acc0c1aaa354e15ac4ee2b1f8ff4cc9)
---
 .../InstCombine/InstCombineCompares.cpp       |  2 +-
 .../test/Transforms/InstCombine/opaque-ptr.ll | 35 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 5a4791870ac77..b64ac20ab0533 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -834,7 +834,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
         return replaceInstUsesWith(I, // No comparison is needed here.
           ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
 
-      else if (NumDifferences == 1 && CanFold(NW)) {
+      else if (NumDifferences == 1 && NW != GEPNoWrapFlags::none()) {
         Value *LHSV = GEPLHS->getOperand(DiffOperand);
         Value *RHSV = GEPRHS->getOperand(DiffOperand);
         return NewICmp(NW, LHSV, RHSV);
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index b05274658e812..be734243d14a1 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -467,6 +467,41 @@ define i1 @cmp_gep_same_base_same_type(ptr %ptr, i64 %idx1, i64 %idx2) {
   ret i1 %cmp
 }
 
+define i1 @cmp_gep_same_base_same_type_maywrap(ptr %ptr, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @cmp_gep_same_base_same_type_maywrap(
+; CHECK-NEXT:    [[CMP_UNSHIFTED:%.*]] = xor i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT:    [[CMP_MASK:%.*]] = and i64 [[CMP_UNSHIFTED]], 4611686018427387903
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[CMP_MASK]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr i32, ptr %ptr, i64 %idx1
+  %gep2 = getelementptr i32, ptr %ptr, i64 %idx2
+  %cmp = icmp eq ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @cmp_gep_same_base_same_type_nuw(ptr %ptr, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @cmp_gep_same_base_same_type_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i32, ptr %ptr, i64 %idx1
+  %gep2 = getelementptr nuw i32, ptr %ptr, i64 %idx2
+  %cmp = icmp eq ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @cmp_gep_same_base_same_type_nusw(ptr %ptr, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @cmp_gep_same_base_same_type_nusw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nusw i32, ptr %ptr, i64 %idx1
+  %gep2 = getelementptr nusw i32, ptr %ptr, i64 %idx2
+  %cmp = icmp eq ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
 define i1 @cmp_gep_same_base_different_type(ptr %ptr, i64 %idx1, i64 %idx2) {
 ; CHECK-LABEL: @cmp_gep_same_base_different_type(
 ; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nsw i64 [[IDX1:%.*]], 2

From 3661520b46dcf704ebb5e85df011eb30ff3ae67d Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 29 Jan 2025 03:58:29 +0000
Subject: [PATCH 127/282] [ORC][LLI] Remove redundant eh-frame registration
 plugin construction from lli.

As of d0052ebbe2e the setUpGenericLLVMIRPlatform function will automatically
add an instance of the EHFrameRegistrationPlugin (for LLJIT instances whose
object linking layers are ObjectLinkingLayers, not RTDyldObjectLinkingLayers).

This commit removes the redundant plugin creation in the object linking
layer constructor function in lli.cpp to prevent duplicate registration of
eh-frames, which is likely the cause of recent bot failures, e.g.
https://lab.llvm.org/buildbot/#/builders/108/builds/8685.

(cherry picked from commit 9052b37ab1aa67a039b34356f37236fecc42bac2)
---
 llvm/tools/lli/lli.cpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp
index 448660a539a0b..19246f0394167 100644
--- a/llvm/tools/lli/lli.cpp
+++ b/llvm/tools/lli/lli.cpp
@@ -27,9 +27,7 @@
 #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h"
 #include "llvm/ExecutionEngine/Orc/DebugUtils.h"
 #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h"
-#include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h"
 #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h"
-#include "llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h"
 #include "llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRPartitionLayer.h"
@@ -1033,14 +1031,10 @@ int runOrcJIT(const char *ProgName) {
     Builder.getJITTargetMachineBuilder()
         ->setRelocationModel(Reloc::PIC_)
         .setCodeModel(CodeModel::Small);
-    Builder.setObjectLinkingLayerCreator([&P](orc::ExecutionSession &ES,
-                                              const Triple &TT) {
-      auto L = std::make_unique<orc::ObjectLinkingLayer>(ES);
-      if (P != LLJITPlatform::ExecutorNative)
-        L->addPlugin(std::make_unique<orc::EHFrameRegistrationPlugin>(
-            ES, ExitOnErr(orc::EPCEHFrameRegistrar::Create(ES))));
-      return L;
-    });
+    Builder.setObjectLinkingLayerCreator(
+        [&](orc::ExecutionSession &ES, const Triple &TT) {
+          return std::make_unique<orc::ObjectLinkingLayer>(ES);
+        });
   }
 
   auto J = ExitOnErr(Builder.create());

From 4806a5572e71e469015e04472a38afe35a7106d3 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen@sifive.com>
Date: Tue, 11 Feb 2025 15:19:19 +0800
Subject: [PATCH 128/282] [RISCV][compiler-rt] drop __riscv_vendor_feature_bits
 (#126460)

Address https://github.com/riscv-non-isa/riscv-c-api-doc/pull/101

(cherry picked from commit 2cd8207b26ea4269630feba661f68554d7ae3c15)
---
 compiler-rt/lib/builtins/cpu_model/riscv.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
index 74534896057ef..6879c2ad48264 100644
--- a/compiler-rt/lib/builtins/cpu_model/riscv.c
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -14,12 +14,6 @@ struct {
   unsigned long long features[RISCV_FEATURE_BITS_LENGTH];
 } __riscv_feature_bits __attribute__((visibility("hidden"), nocommon));
 
-#define RISCV_VENDOR_FEATURE_BITS_LENGTH 1
-struct {
-  unsigned length;
-  unsigned long long features[RISCV_VENDOR_FEATURE_BITS_LENGTH];
-} __riscv_vendor_feature_bits __attribute__((visibility("hidden"), nocommon));
-
 struct {
   unsigned mvendorid;
   unsigned long long marchid;
@@ -338,11 +332,11 @@ static int FeaturesBitCached = 0;
 void __init_riscv_feature_bits(void *);
 static void __init_riscv_feature_bits_ctor(void) CONSTRUCTOR_ATTRIBUTE;
 
-// A constructor function that sets __riscv_feature_bits, and
-// __riscv_vendor_feature_bits to the right values.  This needs to run
-// only once.  This constructor is given the highest priority and it should
-// run before constructors without the priority set.  However, it still runs
-// after ifunc initializers and needs to be called explicitly there.
+// A constructor function that sets __riscv_feature_bits
+// to the right values.  This needs to run only once.  This constructor is given
+// the highest priority and it should run before constructors without the
+// priority set.  However, it still runs after ifunc initializers and needs to
+// be called explicitly there.
 
 static void CONSTRUCTOR_ATTRIBUTE __init_riscv_feature_bits_ctor(void) {
   __init_riscv_feature_bits(0);
@@ -357,7 +351,6 @@ void __init_riscv_feature_bits(void *PlatformArgs) {
     return;
 
   __riscv_feature_bits.length = RISCV_FEATURE_BITS_LENGTH;
-  __riscv_vendor_feature_bits.length = RISCV_VENDOR_FEATURE_BITS_LENGTH;
 
 #if defined(__linux__)
   struct riscv_hwprobe Hwprobes[] = {

From 20621e2609ac49dcf01c4353dbfd69626cea3015 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 11 Feb 2025 21:59:18 -0800
Subject: [PATCH 129/282] Bump version to 20.1.0-rc2 (#126859)

---
 cmake/Modules/LLVMVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index fd7cc0868aa3c..848c94feb19e1 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -10,6 +10,6 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX -rc1)
+  set(LLVM_VERSION_SUFFIX -rc2)
 endif()
 

From cc60c22a9247423d5e32bd848c405e09b1175a32 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 12 Feb 2025 11:04:08 +0000
Subject: [PATCH 130/282] [AArch64][DAG] Allow fptos/ui.sat to scalarized.
 (#126799)

We we previously running into problems with fp128 types and certain
integer sizes.

Fixes an issue reported on #124984

(cherry picked from commit bf7af2d12e3bb8c7bc322ed1c5bf4e9904ad409c)
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  18 +++
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 148 ++++++++++++++++++
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 118 ++++++++++++++
 4 files changed, 285 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index f13f70e66cfaa..b58c160b5c8b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -880,6 +880,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
   SDValue ScalarizeVecOp_BITCAST(SDNode *N);
   SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
+  SDValue ScalarizeVecOp_UnaryOpWithExtraInput(SDNode *N);
   SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N);
   SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue ScalarizeVecOp_INSERT_SUBVECTOR(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1000235ab4061..7c77e8a72c8ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -773,6 +773,10 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::LLRINT:
     Res = ScalarizeVecOp_UnaryOp(N);
     break;
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT:
+    Res = ScalarizeVecOp_UnaryOpWithExtraInput(N);
+    break;
   case ISD::STRICT_SINT_TO_FP:
   case ISD::STRICT_UINT_TO_FP:
   case ISD::STRICT_FP_TO_SINT:
@@ -880,6 +884,20 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
   return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
 }
 
+/// Same as ScalarizeVecOp_UnaryOp with an extra operand (for example a
+/// typesize).
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOpWithExtraInput(SDNode *N) {
+  assert(N->getValueType(0).getVectorNumElements() == 1 &&
+         "Unexpected vector type!");
+  SDValue Elt = GetScalarizedVector(N->getOperand(0));
+  SDValue Op =
+      DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0).getScalarType(),
+                  Elt, N->getOperand(1));
+  // Revectorize the result so the types line up with what the uses of this
+  // expression expect.
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
+}
+
 /// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
 /// Do the strict FP operation on the element instead.
 SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) {
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 9ef6d61c350ec..b2b3430f4d85e 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -5548,3 +5548,151 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) {
     %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f)
     ret <16 x i16> %x
 }
+
+define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
+; CHECK-SD-LABEL: test_signed_v2f128_v2i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #96
+; CHECK-SD-NEXT:    stp x30, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov v2.16b, v1.16b
+; CHECK-SD-NEXT:    stp q1, q0, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    adrp x8, .LCPI86_0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI86_0]
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __fixtfdi
+; CHECK-SD-NEXT:    adrp x8, .LCPI86_1
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w19, #0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI86_1]
+; CHECK-SD-NEXT:    mov x20, #-9223372036854775808 // =0x8000000000000000
+; CHECK-SD-NEXT:    csel x19, x20, x0, lt
+; CHECK-SD-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov x21, #9223372036854775807 // =0x7fffffffffffffff
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel x19, x21, x19, gt
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    csel x8, xzr, x19, ne
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __fixtfdi
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w19, #0
+; CHECK-SD-NEXT:    csel x19, x20, x0, lt
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel x19, x21, x19, gt
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    csel x8, xzr, x19, ne
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    ldp x30, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    add sp, sp, #96
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_signed_v2f128_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sub sp, sp, #112
+; CHECK-GI-NEXT:    stp x30, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    adrp x8, .LCPI86_1
+; CHECK-GI-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI86_1]
+; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v2.16b
+; CHECK-GI-NEXT:    str q2, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    mov x20, #-4594234569871327232 // =0xc03e000000000000
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    csel x21, x8, x20, lt
+; CHECK-GI-NEXT:    adrp x8, .LCPI86_0
+; CHECK-GI-NEXT:    mov v0.d[1], x21
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI86_0]
+; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    mov x22, #-1125899906842624 // =0xfffc000000000000
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    mov x23, #4629137466983448575 // =0x403dffffffffffff
+; CHECK-GI-NEXT:    csel x8, x19, x22, gt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    csel x8, x21, x23, gt
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    bl __fixtfdi
+; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x21, xzr, x19, ne
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    csel x20, x8, x20, lt
+; CHECK-GI-NEXT:    mov v0.d[1], x20
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x8, x19, x22, gt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    csel x8, x20, x23, gt
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    bl __fixtfdi
+; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    mov v0.d[0], x21
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x8, xzr, x19, ne
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x30, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ret
+    %x = call <2 x i64> @llvm.fptosi.sat.v2f128.v2i64(<2 x fp128> %f)
+    ret <2 x i64> %x
+}
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index e1670ad2dc053..b76df6a101e5f 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -4546,3 +4546,121 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) {
     %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f)
     ret <16 x i16> %x
 }
+
+define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
+; CHECK-SD-LABEL: test_signed_v2f128_v2i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #80
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov v2.16b, v1.16b
+; CHECK-SD-NEXT:    stp q1, q0, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    adrp x8, .LCPI86_0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI86_0]
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __fixunstfdi
+; CHECK-SD-NEXT:    adrp x8, .LCPI86_1
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w19, #0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI86_1]
+; CHECK-SD-NEXT:    csel x19, xzr, x0, lt
+; CHECK-SD-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    csinv x8, x19, xzr, le
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __fixunstfdi
+; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w19, #0
+; CHECK-SD-NEXT:    csel x19, xzr, x0, lt
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    csinv x8, x19, xzr, le
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    add sp, sp, #80
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_signed_v2f128_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sub sp, sp, #96
+; CHECK-GI-NEXT:    stp x30, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    adrp x8, .LCPI86_1
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI86_1]
+; CHECK-GI-NEXT:    stp q2, q1, [sp, #16] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v2.16b
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    adrp x8, .LCPI86_0
+; CHECK-GI-NEXT:    mov v0.d[1], x20
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI86_0]
+; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    mov x21, #-562949953421312 // =0xfffe000000000000
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    mov x22, #4629418941960159231 // =0x403effffffffffff
+; CHECK-GI-NEXT:    csel x8, x19, x21, gt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    csel x8, x20, x22, gt
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    bl __fixunstfdi
+; CHECK-GI-NEXT:    ldp q1, q0, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], x20
+; CHECK-GI-NEXT:    csel x23, x8, xzr, lt
+; CHECK-GI-NEXT:    mov v0.d[1], x23
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x8, x20, x21, gt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    csel x8, x23, x22, gt
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    bl __fixunstfdi
+; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x30, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x0
+; CHECK-GI-NEXT:    add sp, sp, #96
+; CHECK-GI-NEXT:    ret
+    %x = call <2 x i64> @llvm.fptoui.sat.v2f128.v2i64(<2 x fp128> %f)
+    ret <2 x i64> %x
+}

From 2342bb2b004349b466e4473db53d190d797f9ef2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Tue, 11 Feb 2025 17:20:23 +0100
Subject: [PATCH 131/282] [llvm] [cmake] Expose `LLVM_BUILD_TELEMETRY` in
 `LLVMConfig.cmake` (#126710)

Add `LLVM_BUILD_TELEMETRY` to the list of flags exposed in
`LLVMConfig.cmake`. This fixes telemetry library being misdetected as
`OFF` when building LLDB standalone.

Fixes bac62ee5b473e70981a6bd9759ec316315fca07d.

------

I guess this also needs a backport to 20.x.

(cherry picked from commit 2ad9d5f5f01cd4f29788a0cf7b21790df13fca71)
---
 llvm/cmake/modules/LLVMConfig.cmake.in | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in
index c49f10b9343ff..28655ee3ab87d 100644
--- a/llvm/cmake/modules/LLVMConfig.cmake.in
+++ b/llvm/cmake/modules/LLVMConfig.cmake.in
@@ -100,6 +100,8 @@ set(LLVM_ENABLE_PIC @LLVM_ENABLE_PIC@)
 
 set(LLVM_BUILD_32_BITS @LLVM_BUILD_32_BITS@)
 
+set(LLVM_BUILD_TELEMETRY @LLVM_BUILD_TELEMETRY@)
+
 if (NOT "@LLVM_PTHREAD_LIB@" STREQUAL "")
   set(LLVM_PTHREAD_LIB "@LLVM_PTHREAD_LIB@")
 endif()

From c99d6118fe0d1c323b2a978344491881197c8dee Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 12 Feb 2025 09:44:26 +0100
Subject: [PATCH 132/282] [BOLT] Use getMainExecutable() (#126698)

Use LLVM's getMainExecutable() helper instead of rolling our own. This
will result in standard behavior across platforms, such as making sure
that symlinks are always resolved.

(cherry picked from commit 0abe058d7f99c9c7bbaf4ee98308c5e78d229897)
---
 bolt/tools/driver/llvm-bolt.cpp | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp
index efa06cd68cb99..f151cf5f63fc5 100644
--- a/bolt/tools/driver/llvm-bolt.cpp
+++ b/bolt/tools/driver/llvm-bolt.cpp
@@ -173,16 +173,6 @@ void boltMode(int argc, char **argv) {
   }
 }
 
-static std::string GetExecutablePath(const char *Argv0) {
-  SmallString<256> ExecutablePath(Argv0);
-  // Do a PATH lookup if Argv0 isn't a valid path.
-  if (!llvm::sys::fs::exists(ExecutablePath))
-    if (llvm::ErrorOr<std::string> P =
-            llvm::sys::findProgramByName(ExecutablePath))
-      ExecutablePath = *P;
-  return std::string(ExecutablePath);
-}
-
 int main(int argc, char **argv) {
   // Print a stack trace if we signal out.
   sys::PrintStackTraceOnErrorSignal(argv[0]);
@@ -190,7 +180,7 @@ int main(int argc, char **argv) {
 
   llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
 
-  std::string ToolPath = GetExecutablePath(argv[0]);
+  std::string ToolPath = llvm::sys::fs::getMainExecutable(argv[0], nullptr);
 
   // Initialize targets and assembly printers/parsers.
   llvm::InitializeAllTargetInfos();

From 539174b470f48fa688c59276ebabc841beadcf37 Mon Sep 17 00:00:00 2001
From: CarolineConcatto <caroline.concatto@arm.com>
Date: Fri, 14 Feb 2025 02:30:50 +0000
Subject: [PATCH 133/282] Add release note for Armv9.6 updates (#126513)

---
 llvm/docs/ReleaseNotes.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 28908490b8f7c..b42e111dc4283 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -151,6 +151,9 @@ Changes to the AArch64 Backend
 
 * Added support for the FUJITSU-MONAKA CPU.
 
+* Updated feature dependency in Armv9.6 for FEAT_FAMINMAX, FEAT_LUT and
+  FEAT_FP8, now they depend only on FEAT_NEON.
+
 Changes to the AMDGPU Backend
 -----------------------------
 

From 82150695fb944729a7fa2e6622318ca90e5d5e95 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Tue, 4 Feb 2025 01:33:44 -0800
Subject: [PATCH 134/282] [clang-format] Hanlde qualified type name for
 `QualifierAlignment` (#125327)

Fixes #125178.

(cherry picked from commit eb6ca1242c1035fac6a8f1edfe7925b4994d4ecf)
---
 clang/lib/Format/QualifierAlignmentFixer.cpp  | 12 +++++++++++-
 clang/unittests/Format/QualifierFixerTest.cpp | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/QualifierAlignmentFixer.cpp b/clang/lib/Format/QualifierAlignmentFixer.cpp
index 21fb5074b4928..23e8b44eee15c 100644
--- a/clang/lib/Format/QualifierAlignmentFixer.cpp
+++ b/clang/lib/Format/QualifierAlignmentFixer.cpp
@@ -132,8 +132,10 @@ static void rotateTokens(const SourceManager &SourceMgr,
   // Then move through the other tokens.
   auto *Tok = Begin;
   while (Tok != End) {
-    if (!NewText.empty() && !endsWithSpace(NewText))
+    if (!NewText.empty() && !endsWithSpace(NewText) &&
+        Tok->isNot(tok::coloncolon)) {
       NewText += " ";
+    }
 
     NewText += Tok->TokenText;
     Tok = Tok->Next;
@@ -412,6 +414,14 @@ const FormatToken *LeftRightQualifierAlignmentFixer::analyzeLeft(
   // The case `const long long volatile int` -> `const volatile long long int`
   // The case `long volatile long int const` -> `const volatile long long int`
   if (TypeToken->isTypeName(LangOpts)) {
+    for (const auto *Prev = TypeToken->Previous;
+         Prev && Prev->is(tok::coloncolon); Prev = Prev->Previous) {
+      TypeToken = Prev;
+      Prev = Prev->Previous;
+      if (!(Prev && Prev->is(tok::identifier)))
+        break;
+      TypeToken = Prev;
+    }
     const FormatToken *LastSimpleTypeSpecifier = TypeToken;
     while (isConfiguredQualifierOrType(
         LastSimpleTypeSpecifier->getPreviousNonComment(),
diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp
index 129828b0d187a..3eae39f267c3e 100644
--- a/clang/unittests/Format/QualifierFixerTest.cpp
+++ b/clang/unittests/Format/QualifierFixerTest.cpp
@@ -1291,6 +1291,21 @@ TEST_F(QualifierFixerTest, WithCpp11Attribute) {
                "[[maybe_unused]] constexpr static int A", Style);
 }
 
+TEST_F(QualifierFixerTest, WithQualifiedTypeName) {
+  auto Style = getLLVMStyle();
+  Style.QualifierAlignment = FormatStyle::QAS_Custom;
+  Style.QualifierOrder = {"constexpr", "type", "const"};
+
+  verifyFormat("constexpr ::int64_t x{1};", "::int64_t constexpr x{1};", Style);
+  verifyFormat("constexpr std::int64_t x{123};",
+               "std::int64_t constexpr x{123};", Style);
+  verifyFormat("constexpr ::std::int64_t x{123};",
+               "::std::int64_t constexpr x{123};", Style);
+
+  Style.TypeNames.push_back("bar");
+  verifyFormat("constexpr foo::bar x{12};", "foo::bar constexpr x{12};", Style);
+}
+
 TEST_F(QualifierFixerTest, DisableRegions) {
   FormatStyle Style = getLLVMStyle();
   Style.QualifierAlignment = FormatStyle::QAS_Custom;

From a7f00c8ed8c20a006fd7e592e99f3ca7b725b07e Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 30 Jan 2025 18:03:04 -0800
Subject: [PATCH 135/282] [clang-format] Fix a crash on parsing requires clause
 (#125021)

Fixes #124921.

(cherry picked from commit 14178deab0334d9ce095ae7adce408868659faee)
---
 clang/lib/Format/UnwrappedLineParser.cpp | 2 +-
 clang/unittests/Format/FormatTest.cpp    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 906fc11a07d5e..120922d271aab 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -3632,7 +3632,7 @@ void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) {
   // It could be inlined into here.
   parseConstraintExpression();
 
-  if (!InRequiresExpression)
+  if (!InRequiresExpression && FormatTok->Previous)
     FormatTok->Previous->ClosesRequiresClause = true;
 }
 
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 57f12221cdc7e..30f3533ac73f7 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -26478,6 +26478,9 @@ TEST_F(FormatTest, RequiresClauses) {
                "foo();\n"
                "#endif\n"
                "bar(requires);");
+
+  verifyNoCrash("template <class T>\n"
+                "    requires(requires { std::declval<T>()");
 }
 
 TEST_F(FormatTest, RequiresExpressionIndentation) {

From a54f1aaa4b6a59047be0d26b5bbc5e63393ba90b Mon Sep 17 00:00:00 2001
From: Tristan Ross <tristan.ross@midstall.com>
Date: Thu, 13 Feb 2025 14:56:55 -0800
Subject: [PATCH 136/282] [ORC][unittests] Remove hard coded 16k page size
 (#127115)

Fixes a couple hard coded 16k values which is being used as the page
size. Replaces the hard coded value with the system's page size. This
fixes #116753 on an Ampere Altra Q64-22

CC @lhames

(cherry picked from commit 415607e10b56d0e6c4661ff1ec5b9b46bf433cba)
---
 .../Orc/JITLinkRedirectionManagerTest.cpp              | 10 ++++++++--
 .../ExecutionEngine/Orc/ReOptimizeLayerTest.cpp        |  9 ++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp
index 4b8a3efe680f1..a57241b8a3da6 100644
--- a/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp
@@ -48,12 +48,18 @@ class JITLinkRedirectionManagerTest : public testing::Test {
     if (Triple.isPPC())
       GTEST_SKIP();
 
+    auto PageSize = sys::Process::getPageSize();
+    if (!PageSize) {
+      consumeError(PageSize.takeError());
+      GTEST_SKIP();
+    }
+
     ES = std::make_unique<ExecutionSession>(
         std::make_unique<UnsupportedExecutorProcessControl>(
-            nullptr, nullptr, JTMB->getTargetTriple().getTriple()));
+            nullptr, nullptr, JTMB->getTargetTriple().getTriple(), *PageSize));
     JD = &ES->createBareJITDylib("main");
     ObjLinkingLayer = std::make_unique<ObjectLinkingLayer>(
-        *ES, std::make_unique<InProcessMemoryManager>(16384));
+        *ES, std::make_unique<InProcessMemoryManager>(*PageSize));
     DL = std::make_unique<DataLayout>(std::move(*DLOrErr));
   }
   JITDylib *JD{nullptr};
diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
index 083a924ce9aa1..991b12def55fa 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
@@ -66,10 +66,17 @@ class ReOptimizeLayerTest : public testing::Test {
       consumeError(DLOrErr.takeError());
       GTEST_SKIP();
     }
+
+    auto PageSize = sys::Process::getPageSize();
+    if (!PageSize) {
+      consumeError(PageSize.takeError());
+      GTEST_SKIP();
+    }
+
     ES = std::make_unique<ExecutionSession>(std::move(*EPC));
     JD = &ES->createBareJITDylib("main");
     ObjLinkingLayer = std::make_unique<ObjectLinkingLayer>(
-        *ES, std::make_unique<InProcessMemoryManager>(16384));
+        *ES, std::make_unique<InProcessMemoryManager>(*PageSize));
     DL = std::make_unique<DataLayout>(std::move(*DLOrErr));
 
     auto TM = JTMB->createTargetMachine();

From 79da30bceea05b3096096ce613faf9a39f9cae2c Mon Sep 17 00:00:00 2001
From: Nathan Ridge <zeratul976@hotmail.com>
Date: Thu, 13 Feb 2025 18:32:12 -0500
Subject: [PATCH 137/282] [clang][AST] Handle dependent representation of call
 to function with explicit object parameter in CallExpr::getBeginLoc()
 (#126868)

This fixes a crash where CallExpr::getBeginLoc() tries to access the
first argument of a CallExpr representing a call to a function with
an explicit object parameter, assuming that a first argument exists
because it's the object argument.

This is the case for non-dependent calls, but for dependent calls
the object argument is part of the callee (the semantic analysis
that separates it out has not been performed yet) and so there may
not be a first argument.

Fixes https://github.com/llvm/llvm-project/issues/126720

(cherry picked from commit 32c8754fbcb936ba6b5bc6cb6817cf3b6a4602f4)
---
 clang/lib/AST/Expr.cpp                        | 21 ++++++++++++++-----
 .../test/AST/ast-dump-cxx2b-deducing-this.cpp | 13 ++++++++++++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 06b0491442673..aa7e14329a21b 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1645,11 +1645,22 @@ SourceLocation CallExpr::getBeginLoc() const {
   if (const auto *OCE = dyn_cast<CXXOperatorCallExpr>(this))
     return OCE->getBeginLoc();
 
-  if (const auto *Method =
-          dyn_cast_if_present<const CXXMethodDecl>(getCalleeDecl());
-      Method && Method->isExplicitObjectMemberFunction()) {
-    assert(getNumArgs() > 0 && getArg(0));
-    return getArg(0)->getBeginLoc();
+  // A non-dependent call to a member function with an explicit object parameter
+  // is modelled with the object expression being the first argument, e.g. in
+  // `o.f(x)`, the callee will be just `f`, and `o` will be the first argument.
+  // Since the first argument is written before the callee, the expression's
+  // begin location should come from the first argument.
+  // This does not apply to dependent calls, which are modelled with `o.f`
+  // being the callee.
+  if (!isTypeDependent()) {
+    if (const auto *Method =
+            dyn_cast_if_present<const CXXMethodDecl>(getCalleeDecl());
+        Method && Method->isExplicitObjectMemberFunction()) {
+      bool HasFirstArg = getNumArgs() > 0 && getArg(0);
+      assert(HasFirstArg);
+      if (HasFirstArg)
+        return getArg(0)->getBeginLoc();
+    }
   }
 
   SourceLocation begin = getCallee()->getBeginLoc();
diff --git a/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp b/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp
index 854d12b4cdba6..abe9d6a5b5bc6 100644
--- a/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp
+++ b/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp
@@ -13,3 +13,16 @@ void main() {
   // CHECK-NEXT: | `-DeclRefExpr 0x{{[^ ]*}} <col:13> 'int (S &)' lvalue CXXMethod 0x{{[^ ]*}} 'f' 'int (S &)'
 }
 }
+
+namespace GH1269720 {
+template <typename T>
+struct S {
+  void f(this S&);
+  void g(S s) {
+    s.f();
+  }
+  // CHECK: CallExpr 0x{{[^ ]*}} <line:22:5, col:9> '<dependent type>'
+  // CHECK-NEXT: `-MemberExpr 0x{{[^ ]*}} <col:5, col:7> '<bound member function type>' .f
+  // CHECK-NEXT:   `-DeclRefExpr 0x{{[^ ]*}} <col:5> 'S<T>' lvalue ParmVar 0x{{[^ ]*}} 's' 'S<T>'
+};
+}

From 17dd52ca6b0e83f6ce5214b65056e462cf6a7ccd Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Thu, 13 Feb 2025 18:42:28 +0100
Subject: [PATCH 138/282] libc/cmake: don't fail if LLVM_VERSION_SUFFIX isn't
 defined (#126359)

Closes: #126358

cc @samvangysegem

---------

Co-authored-by: Joseph Huber <huberjn@outlook.com>
(cherry picked from commit c81139f417a209dbd2a4e06465483d4b0951a9ac)
---
 libc/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 1c4c0cd5aa22b..47708c2267818 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -51,7 +51,7 @@ set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel header
 # Defining a global namespace to enclose all libc functions.
 set(default_namespace "__llvm_libc")
 if(LLVM_VERSION_MAJOR)
-  string(REPLACE "-" "" NS_LLVM_VERSION_SUFFIX ${LLVM_VERSION_SUFFIX})
+  string(REPLACE "-" "" NS_LLVM_VERSION_SUFFIX "${LLVM_VERSION_SUFFIX}")
   set(default_namespace "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LLVM_VERSION_PATCH}_${NS_LLVM_VERSION_SUFFIX}")
 endif()
 set(LIBC_NAMESPACE ${default_namespace}

From 4f10d6d3ae1a812c4c3e918d68797c1689fb1bef Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 13 Feb 2025 10:46:42 +0000
Subject: [PATCH 139/282] [reland][DebugInfo] Update DIBuilder insertion to
 take InsertPosition (#126967)

After #124287 updated several functions to return iterators rather than
Instruction *, it was no longer straightforward to pass their result to
DIBuilder. This commit updates DIBuilder methods to accept an
InsertPosition instead, so that they can be called with an iterator
(preferred), or with a deprecation warning an Instruction *, or a
BasicBlock *. This commit also updates the existing calls to the
DIBuilder methods to pass in iterators.

As a special exception, DIBuilder::insertDeclare() keeps a separate
overload accepting a BasicBlock *InsertAtEnd. This is because despite
the name, this method does not insert at the end of the block, therefore
this cannot be handled implicitly by using InsertPosition.

(cherry picked from commit 1083ec647f16314bcc9af8c4d6b11f50d288bca6)
---
 clang/lib/CodeGen/CGDebugInfo.cpp             |   6 +-
 llvm/include/llvm/IR/DIBuilder.h              |  52 ++------
 llvm/lib/IR/DIBuilder.cpp                     | 111 +++++-------------
 llvm/lib/IR/DebugInfo.cpp                     |  47 +++++---
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  |   4 +-
 llvm/lib/Transforms/Scalar/SROA.cpp           |   2 +-
 llvm/lib/Transforms/Utils/Debugify.cpp        |  14 ++-
 llvm/lib/Transforms/Utils/Local.cpp           |  23 +---
 .../Utils/PromoteMemoryToRegister.cpp         |   3 +-
 llvm/unittests/IR/IRBuilderTest.cpp           |  10 +-
 .../Transforms/Utils/CloningTest.cpp          |   5 +-
 11 files changed, 98 insertions(+), 179 deletions(-)

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index f0abfaa7324fc..d28ef60f84e92 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -5119,7 +5119,7 @@ void CGDebugInfo::EmitLabel(const LabelDecl *D, CGBuilderTy &Builder) {
   DBuilder.insertLabel(L,
                        llvm::DILocation::get(CGM.getLLVMContext(), Line, Column,
                                              Scope, CurInlinedAt),
-                       Builder.GetInsertBlock());
+                       Builder.GetInsertBlock()->end());
 }
 
 llvm::DIType *CGDebugInfo::CreateSelfType(const QualType &QualTy,
@@ -5197,7 +5197,7 @@ void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable(
                                   LexicalBlockStack.back(), CurInlinedAt);
   auto *Expr = DBuilder.createExpression(addr);
   if (InsertPoint)
-    DBuilder.insertDeclare(Storage, D, Expr, DL, InsertPoint);
+    DBuilder.insertDeclare(Storage, D, Expr, DL, InsertPoint->getIterator());
   else
     DBuilder.insertDeclare(Storage, D, Expr, DL, Builder.GetInsertBlock());
 }
@@ -5862,7 +5862,7 @@ void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder,
 
   if (auto InsertPoint = Value->getInsertionPointAfterDef()) {
     DBuilder.insertDbgValueIntrinsic(Value, D, DBuilder.createExpression(), DIL,
-                                     &**InsertPoint);
+                                     *InsertPoint);
   }
 }
 
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index 6c479415b9ed2..f777206d0d735 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -92,33 +92,15 @@ namespace llvm {
     /// Create an \a temporary node and track it in \a UnresolvedNodes.
     void trackIfUnresolved(MDNode *N);
 
-    /// Internal helper for insertDeclare.
-    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                             DIExpression *Expr, const DILocation *DL,
-                             BasicBlock *InsertBB, Instruction *InsertBefore);
-
-    /// Internal helper for insertLabel.
-    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                           BasicBlock *InsertBB, Instruction *InsertBefore);
-
     /// Internal helper. Track metadata if untracked and insert \p DVR.
-    void insertDbgVariableRecord(DbgVariableRecord *DVR, BasicBlock *InsertBB,
-                                 Instruction *InsertBefore,
-                                 bool InsertAtHead = false);
+    void insertDbgVariableRecord(DbgVariableRecord *DVR,
+                                 InsertPosition InsertPt);
 
     /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic.
     Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val,
                                     DILocalVariable *VarInfo,
                                     DIExpression *Expr, const DILocation *DL,
-                                    BasicBlock *InsertBB,
-                                    Instruction *InsertBefore);
-
-    /// Internal helper for insertDbgValueIntrinsic.
-    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
-                                       DILocalVariable *VarInfo,
-                                       DIExpression *Expr, const DILocation *DL,
-                                       BasicBlock *InsertBB,
-                                       Instruction *InsertBefore);
+                                    InsertPosition InsertPt);
 
   public:
     /// Construct a builder for a module.
@@ -993,46 +975,28 @@ namespace llvm {
     /// \param VarInfo      Variable's debug info descriptor.
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
-    /// \param InsertBefore Location for the new intrinsic.
+    /// \param InsertPt     Location for the new intrinsic.
     DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
                              DIExpression *Expr, const DILocation *DL,
-                             Instruction *InsertBefore);
+                             InsertPosition InsertPt);
 
     /// Insert a new llvm.dbg.label intrinsic call.
     /// \param LabelInfo    Label's debug info descriptor.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
     DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                           Instruction *InsertBefore);
-
-    /// Insert a new llvm.dbg.label intrinsic call.
-    /// \param LabelInfo    Label's debug info descriptor.
-    /// \param DL           Debug info location.
-    /// \param InsertAtEnd Location for the new intrinsic.
-    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                           BasicBlock *InsertAtEnd);
+                           InsertPosition InsertPt);
 
     /// Insert a new llvm.dbg.value intrinsic call.
     /// \param Val          llvm::Value of the variable
     /// \param VarInfo      Variable's debug info descriptor.
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
-    /// \param InsertAtEnd Location for the new intrinsic.
-    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
-                                       DILocalVariable *VarInfo,
-                                       DIExpression *Expr, const DILocation *DL,
-                                       BasicBlock *InsertAtEnd);
-
-    /// Insert a new llvm.dbg.value intrinsic call.
-    /// \param Val          llvm::Value of the variable
-    /// \param VarInfo      Variable's debug info descriptor.
-    /// \param Expr         A complex location expression.
-    /// \param DL           Debug info location.
-    /// \param InsertBefore Location for the new intrinsic.
+    /// \param InsertPt     Location for the new intrinsic.
     DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
                                        DILocalVariable *VarInfo,
                                        DIExpression *Expr, const DILocation *DL,
-                                       Instruction *InsertBefore);
+                                       InsertPosition InsertPt);
 
     /// Replace the vtable holder in the given type.
     ///
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 8f9462ab46d88..752a43213b716 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -959,20 +959,15 @@ DILexicalBlock *DIBuilder::createLexicalBlock(DIScope *Scope, DIFile *File,
                                      File, Line, Col);
 }
 
-DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                    DIExpression *Expr, const DILocation *DL,
-                                    Instruction *InsertBefore) {
-  return insertDeclare(Storage, VarInfo, Expr, DL, InsertBefore->getParent(),
-                       InsertBefore);
-}
-
 DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
                                     DIExpression *Expr, const DILocation *DL,
                                     BasicBlock *InsertAtEnd) {
   // If this block already has a terminator then insert this intrinsic before
   // the terminator. Otherwise, put it at the end of the block.
   Instruction *InsertBefore = InsertAtEnd->getTerminator();
-  return insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd, InsertBefore);
+  return insertDeclare(Storage, VarInfo, Expr, DL,
+                       InsertBefore ? InsertBefore->getIterator()
+                                    : InsertAtEnd->end());
 }
 
 DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
@@ -987,11 +982,10 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
   if (M.IsNewDbgInfoFormat) {
     DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
         Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
-    BasicBlock *InsertBB = LinkedInstr->getParent();
     // Insert after LinkedInstr.
     BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
-    Instruction *InsertBefore = NextIt == InsertBB->end() ? nullptr : &*NextIt;
-    insertDbgVariableRecord(DVR, InsertBB, InsertBefore, true);
+    NextIt.setHeadBit(true);
+    insertDbgVariableRecord(DVR, NextIt);
     return DVR;
   }
 
@@ -1017,47 +1011,11 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
   return DVI;
 }
 
-DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                  Instruction *InsertBefore) {
-  return insertLabel(LabelInfo, DL,
-                     InsertBefore ? InsertBefore->getParent() : nullptr,
-                     InsertBefore);
-}
-
-DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                  BasicBlock *InsertAtEnd) {
-  return insertLabel(LabelInfo, DL, InsertAtEnd, nullptr);
-}
-
-DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V,
-                                              DILocalVariable *VarInfo,
-                                              DIExpression *Expr,
-                                              const DILocation *DL,
-                                              Instruction *InsertBefore) {
-  DbgInstPtr DVI = insertDbgValueIntrinsic(
-      V, VarInfo, Expr, DL, InsertBefore ? InsertBefore->getParent() : nullptr,
-      InsertBefore);
-  if (auto *Inst = dyn_cast<Instruction *>(DVI))
-    cast<CallInst>(Inst)->setTailCall();
-  return DVI;
-}
-
-DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V,
-                                              DILocalVariable *VarInfo,
-                                              DIExpression *Expr,
-                                              const DILocation *DL,
-                                              BasicBlock *InsertAtEnd) {
-  return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr);
-}
-
 /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
 /// This abstracts over the various ways to specify an insert position.
 static void initIRBuilder(IRBuilder<> &Builder, const DILocation *DL,
-                          BasicBlock *InsertBB, Instruction *InsertBefore) {
-  if (InsertBefore)
-    Builder.SetInsertPoint(InsertBefore);
-  else if (InsertBB)
-    Builder.SetInsertPoint(InsertBB);
+                          InsertPosition InsertPt) {
+  Builder.SetInsertPoint(InsertPt.getBasicBlock(), InsertPt);
   Builder.SetCurrentDebugLocation(DL);
 }
 
@@ -1070,26 +1028,28 @@ static Function *getDeclareIntrin(Module &M) {
   return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare);
 }
 
-DbgInstPtr DIBuilder::insertDbgValueIntrinsic(
-    llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
-    const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
+                                              DILocalVariable *VarInfo,
+                                              DIExpression *Expr,
+                                              const DILocation *DL,
+                                              InsertPosition InsertPt) {
   if (M.IsNewDbgInfoFormat) {
     DbgVariableRecord *DVR =
         DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertBB, InsertBefore);
+    insertDbgVariableRecord(DVR, InsertPt);
     return DVR;
   }
 
   if (!ValueFn)
     ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value);
-  return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB,
-                            InsertBefore);
+  auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt);
+  cast<CallInst>(DVI)->setTailCall();
+  return DVI;
 }
 
 DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
                                     DIExpression *Expr, const DILocation *DL,
-                                    BasicBlock *InsertBB,
-                                    Instruction *InsertBefore) {
+                                    InsertPosition InsertPt) {
   assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.declare");
   assert(DL && "Expected debug loc");
   assert(DL->getScope()->getSubprogram() ==
@@ -1099,7 +1059,7 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
   if (M.IsNewDbgInfoFormat) {
     DbgVariableRecord *DVR =
         DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertBB, InsertBefore);
+    insertDbgVariableRecord(DVR, InsertPt);
     return DVR;
   }
 
@@ -1113,35 +1073,27 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
                    MetadataAsValue::get(VMContext, Expr)};
 
   IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertBB, InsertBefore);
+  initIRBuilder(B, DL, InsertPt);
   return B.CreateCall(DeclareFn, Args);
 }
 
 void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
-                                        BasicBlock *InsertBB,
-                                        Instruction *InsertBefore,
-                                        bool InsertAtHead) {
-  assert(InsertBefore || InsertBB);
+                                        InsertPosition InsertPt) {
+  assert(InsertPt.isValid());
   trackIfUnresolved(DVR->getVariable());
   trackIfUnresolved(DVR->getExpression());
   if (DVR->isDbgAssign())
     trackIfUnresolved(DVR->getAddressExpression());
 
-  BasicBlock::iterator InsertPt;
-  if (InsertBB && InsertBefore)
-    InsertPt = InsertBefore->getIterator();
-  else if (InsertBB)
-    InsertPt = InsertBB->end();
-  InsertPt.setHeadBit(InsertAtHead);
-  InsertBB->insertDbgRecordBefore(DVR, InsertPt);
+  auto *BB = InsertPt.getBasicBlock();
+  BB->insertDbgRecordBefore(DVR, InsertPt);
 }
 
 Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
                                            Value *V, DILocalVariable *VarInfo,
                                            DIExpression *Expr,
                                            const DILocation *DL,
-                                           BasicBlock *InsertBB,
-                                           Instruction *InsertBefore) {
+                                           InsertPosition InsertPt) {
   assert(IntrinsicFn && "must pass a non-null intrinsic function");
   assert(V && "must pass a value to a dbg intrinsic");
   assert(VarInfo &&
@@ -1158,13 +1110,12 @@ Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
                    MetadataAsValue::get(VMContext, Expr)};
 
   IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertBB, InsertBefore);
+  initIRBuilder(B, DL, InsertPt);
   return B.CreateCall(IntrinsicFn, Args);
 }
 
 DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                  BasicBlock *InsertBB,
-                                  Instruction *InsertBefore) {
+                                  InsertPosition InsertPt) {
   assert(LabelInfo && "empty or invalid DILabel* passed to dbg.label");
   assert(DL && "Expected debug loc");
   assert(DL->getScope()->getSubprogram() ==
@@ -1174,10 +1125,10 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
   trackIfUnresolved(LabelInfo);
   if (M.IsNewDbgInfoFormat) {
     DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
-    if (InsertBB && InsertBefore)
-      InsertBB->insertDbgRecordBefore(DLR, InsertBefore->getIterator());
-    else if (InsertBB)
-      InsertBB->insertDbgRecordBefore(DLR, InsertBB->end());
+    if (InsertPt.isValid()) {
+      auto *BB = InsertPt.getBasicBlock();
+      BB->insertDbgRecordBefore(DLR, InsertPt);
+    }
     return DLR;
   }
 
@@ -1187,7 +1138,7 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
   Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
 
   IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertBB, InsertBefore);
+  initIRBuilder(B, DL, InsertPt);
   return B.CreateCall(LabelFn, Args);
 }
 
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 4ce518009bd3e..8c973eb8cf302 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1686,7 +1686,8 @@ LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordBefore(
   DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
       unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
       unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
-      unwrap<Instruction>(Instr));
+      Instr ? InsertPosition(unwrap<Instruction>(Instr)->getIterator())
+            : nullptr);
   // This assert will fail if the module is in the old debug info format.
   // This function should only be called if the module is in the new
   // debug info format.
@@ -1718,7 +1719,9 @@ LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordBefore(
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
       unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
-      unwrap<DILocation>(DebugLoc), unwrap<Instruction>(Instr));
+      unwrap<DILocation>(DebugLoc),
+      Instr ? InsertPosition(unwrap<Instruction>(Instr)->getIterator())
+            : nullptr);
   // This assert will fail if the module is in the old debug info format.
   // This function should only be called if the module is in the new
   // debug info format.
@@ -1734,7 +1737,8 @@ LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordAtEnd(
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
       unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
-      unwrap<DILocation>(DebugLoc), unwrap(Block));
+      unwrap<DILocation>(DebugLoc),
+      Block ? InsertPosition(unwrap(Block)->end()) : nullptr);
   // This assert will fail if the module is in the old debug info format.
   // This function should only be called if the module is in the new
   // debug info format.
@@ -1800,21 +1804,25 @@ void LLVMInstructionSetDebugLoc(LLVMValueRef Inst, LLVMMetadataRef Loc) {
     unwrap<Instruction>(Inst)->setDebugLoc(DebugLoc());
 }
 
-LLVMMetadataRef LLVMDIBuilderCreateLabel(
-    LLVMDIBuilderRef Builder,
-    LLVMMetadataRef Context, const char *Name, size_t NameLen,
-    LLVMMetadataRef File, unsigned LineNo, LLVMBool AlwaysPreserve) {
+LLVMMetadataRef LLVMDIBuilderCreateLabel(LLVMDIBuilderRef Builder,
+                                         LLVMMetadataRef Context,
+                                         const char *Name, size_t NameLen,
+                                         LLVMMetadataRef File, unsigned LineNo,
+                                         LLVMBool AlwaysPreserve) {
   return wrap(unwrap(Builder)->createLabel(
-    unwrapDI<DIScope>(Context), StringRef(Name, NameLen),
-    unwrapDI<DIFile>(File), LineNo, AlwaysPreserve));
+      unwrapDI<DIScope>(Context), StringRef(Name, NameLen),
+      unwrapDI<DIFile>(File), LineNo, AlwaysPreserve));
 }
 
-LLVMDbgRecordRef LLVMDIBuilderInsertLabelBefore(
-    LLVMDIBuilderRef Builder, LLVMMetadataRef LabelInfo,
-    LLVMMetadataRef Location, LLVMValueRef InsertBefore) {
+LLVMDbgRecordRef LLVMDIBuilderInsertLabelBefore(LLVMDIBuilderRef Builder,
+                                                LLVMMetadataRef LabelInfo,
+                                                LLVMMetadataRef Location,
+                                                LLVMValueRef InsertBefore) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertLabel(
-    unwrapDI<DILabel>(LabelInfo), unwrapDI<DILocation>(Location),
-    unwrap<Instruction>(InsertBefore));
+      unwrapDI<DILabel>(LabelInfo), unwrapDI<DILocation>(Location),
+      InsertBefore
+          ? InsertPosition(unwrap<Instruction>(InsertBefore)->getIterator())
+          : nullptr);
   // This assert will fail if the module is in the old debug info format.
   // This function should only be called if the module is in the new
   // debug info format.
@@ -1825,12 +1833,13 @@ LLVMDbgRecordRef LLVMDIBuilderInsertLabelBefore(
   return wrap(cast<DbgRecord *>(DbgInst));
 }
 
-LLVMDbgRecordRef LLVMDIBuilderInsertLabelAtEnd(
-    LLVMDIBuilderRef Builder, LLVMMetadataRef LabelInfo,
-    LLVMMetadataRef Location, LLVMBasicBlockRef InsertAtEnd) {
+LLVMDbgRecordRef LLVMDIBuilderInsertLabelAtEnd(LLVMDIBuilderRef Builder,
+                                               LLVMMetadataRef LabelInfo,
+                                               LLVMMetadataRef Location,
+                                               LLVMBasicBlockRef InsertAtEnd) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertLabel(
-    unwrapDI<DILabel>(LabelInfo), unwrapDI<DILocation>(Location),
-    unwrap(InsertAtEnd));
+      unwrapDI<DILabel>(LabelInfo), unwrapDI<DILocation>(Location),
+      InsertAtEnd ? InsertPosition(unwrap(InsertAtEnd)->end()) : nullptr);
   // This assert will fail if the module is in the old debug info format.
   // This function should only be called if the module is in the new
   // debug info format.
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 5f104f1692731..68edabb083be3 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -850,7 +850,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
   } else {
     DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                            DBuilder.createExpression(), DILoc,
-                           &*Shape.getInsertPtAfterFramePtr());
+                           Shape.getInsertPtAfterFramePtr());
   }
 }
 
@@ -1145,7 +1145,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
             DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
                 .insertDeclare(CurrentReload, DDI->getVariable(),
                                DDI->getExpression(), DDI->getDebugLoc(),
-                               &*Builder.GetInsertPoint());
+                               Builder.GetInsertPoint());
           }
           // This dbg.declare is for the main function entry point.  It
           // will be deleted in all coro-split functions.
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 29240aaaa21be..e88c130cccf20 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5154,7 +5154,7 @@ insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr,
     return;
 
   DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr,
-                    Orig->getDebugLoc(), BeforeInst);
+                    Orig->getDebugLoc(), BeforeInst->getIterator());
 }
 
 /// Insert a new dbg.assign.
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index e5e2aa6556930..e47a6ce6e9205 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -127,7 +127,7 @@ bool llvm::applyDebugifyMetadata(
     // Helper that inserts a dbg.value before \p InsertBefore, copying the
     // location (and possibly the type, if it's non-void) from \p TemplateInst.
     auto insertDbgVal = [&](Instruction &TemplateInst,
-                            Instruction *InsertBefore) {
+                            BasicBlock::iterator InsertPt) {
       std::string Name = utostr(NextVar++);
       Value *V = &TemplateInst;
       if (TemplateInst.getType()->isVoidTy())
@@ -137,7 +137,7 @@ bool llvm::applyDebugifyMetadata(
                                              getCachedDIType(V->getType()),
                                              /*AlwaysPreserve=*/true);
       DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc,
-                                  InsertBefore);
+                                  InsertPt);
     };
 
     for (BasicBlock &BB : F) {
@@ -161,7 +161,9 @@ bool llvm::applyDebugifyMetadata(
       // are made.
       BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
       assert(InsertPt != BB.end() && "Expected to find an insertion point");
-      Instruction *InsertBefore = &*InsertPt;
+
+      // Insert after existing debug values to preserve order.
+      InsertPt.setHeadBit(false);
 
       // Attach debug values.
       for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) {
@@ -172,9 +174,9 @@ bool llvm::applyDebugifyMetadata(
         // Phis and EH pads must be grouped at the beginning of the block.
         // Only advance the insertion point when we finish visiting these.
         if (!isa<PHINode>(I) && !I->isEHPad())
-          InsertBefore = I->getNextNode();
+          InsertPt = std::next(I->getIterator());
 
-        insertDbgVal(*I, InsertBefore);
+        insertDbgVal(*I, InsertPt);
         InsertedDbgVal = true;
       }
     }
@@ -185,7 +187,7 @@ bool llvm::applyDebugifyMetadata(
     // those tests, and this helps with that.)
     if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) {
       auto *Term = findTerminatingInstruction(F.getEntryBlock());
-      insertDbgVal(*Term, Term);
+      insertDbgVal(*Term, Term->getIterator());
     }
     if (ApplyToMF)
       ApplyToMF(DIB, F);
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index d5cf62e52cca3..93f6725bdf398 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1696,9 +1696,7 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV,
                                               const DebugLoc &NewLoc,
                                               BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
-    auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
-                                                  (Instruction *)nullptr);
-    cast<Instruction *>(DbgVal)->insertBefore(Instr);
+    Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
     // DbgVariableRecord directly instead of a dbg.value intrinsic.
@@ -1711,19 +1709,10 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV,
 
 static void insertDbgValueOrDbgVariableRecordAfter(
     DIBuilder &Builder, Value *DV, DILocalVariable *DIVar, DIExpression *DIExpr,
-    const DebugLoc &NewLoc, BasicBlock::iterator Instr) {
-  if (!UseNewDbgInfoFormat) {
-    auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
-                                                  (Instruction *)nullptr);
-    cast<Instruction *>(DbgVal)->insertAfter(Instr);
-  } else {
-    // RemoveDIs: if we're using the new debug-info format, allocate a
-    // DbgVariableRecord directly instead of a dbg.value intrinsic.
-    ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-    DbgVariableRecord *DV =
-        new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
-    Instr->getParent()->insertDbgRecordAfter(DV, &*Instr);
-  }
+    const DebugLoc &NewLoc, Instruction *Instr) {
+  BasicBlock::iterator NextIt = std::next(Instr->getIterator());
+  NextIt.setHeadBit(true);
+  insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, NextIt);
 }
 
 /// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
@@ -1815,7 +1804,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // preferable to keep tracking both the loaded value and the original
   // address in case the alloca can not be elided.
   insertDbgValueOrDbgVariableRecordAfter(Builder, LI, DIVar, DIExpr, NewLoc,
-                                         LI->getIterator());
+                                         LI);
 }
 
 void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR,
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 016186cb6b09d..05fd989271c32 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -119,7 +119,8 @@ static void createDebugValue(DIBuilder &DIB, Value *NewValue,
                              DILocalVariable *Variable,
                              DIExpression *Expression, const DILocation *DI,
                              Instruction *InsertBefore) {
-  DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, InsertBefore);
+  DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI,
+                              InsertBefore->getIterator());
 }
 
 /// Helper for updating assignment tracking debug info when promoting allocas.
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 2fd52860e71b9..3a55d88f03d49 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -923,12 +923,13 @@ TEST_F(IRBuilderTest, DIBuilder) {
 
     { /* dbg.label | DbgLabelRecord */
       // Insert before I and check order.
-      ExpectOrder(DIB.insertLabel(Label, LabelLoc, I), I->getIterator());
+      ExpectOrder(DIB.insertLabel(Label, LabelLoc, I->getIterator()),
+                  I->getIterator());
 
       // We should be able to insert at the end of the block, even if there's
       // no terminator yet. Note that in RemoveDIs mode this record won't get
       // inserted into the block untill another instruction is added.
-      DbgInstPtr LabelRecord = DIB.insertLabel(Label, LabelLoc, BB);
+      DbgInstPtr LabelRecord = DIB.insertLabel(Label, LabelLoc, BB->end());
       // Specifically do not insert a terminator, to check this works. `I`
       // should have absorbed the DbgLabelRecord in the new debug info mode.
       I = Builder.CreateAlloca(Builder.getInt32Ty());
@@ -945,7 +946,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
         DIB.createAutoVariable(BarSP, "Y", File, 2, IntType, true);
     { /* dbg.value | DbgVariableRecord::Value */
       ExpectOrder(DIB.insertDbgValueIntrinsic(I, VarX, DIB.createExpression(),
-                                              VarLoc, I),
+                                              VarLoc, I->getIterator()),
                   I->getIterator());
       // Check inserting at end of the block works as with labels.
       DbgInstPtr VarXValue = DIB.insertDbgValueIntrinsic(
@@ -955,7 +956,8 @@ TEST_F(IRBuilderTest, DIBuilder) {
       EXPECT_EQ(BB->getTrailingDbgRecords(), nullptr);
     }
     { /* dbg.declare | DbgVariableRecord::Declare */
-      ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, I),
+      ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc,
+                                    I->getIterator()),
                   I->getIterator());
       // Check inserting at end of the block works as with labels.
       DbgInstPtr VarYDeclare =
diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp
index 96e863b257c95..1745a6fb3ede5 100644
--- a/llvm/unittests/Transforms/Utils/CloningTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp
@@ -507,7 +507,7 @@ class CloneFunc : public ::testing::Test {
     auto *Variable =
         DBuilder.createAutoVariable(Subprogram, "x", File, 5, IntType, true);
     auto *DL = DILocation::get(Subprogram->getContext(), 5, 0, Subprogram);
-    DBuilder.insertDeclare(Alloca, Variable, E, DL, Store);
+    DBuilder.insertDeclare(Alloca, Variable, E, DL, Store->getIterator());
     DBuilder.insertDbgValueIntrinsic(AllocaContent, Variable, E, DL, Entry);
     // Also create an inlined variable.
     // Create a distinct struct type that we should not duplicate during
@@ -526,7 +526,8 @@ class CloneFunc : public ::testing::Test {
         Subprogram->getContext(), 9, 4, Scope,
         DILocation::get(Subprogram->getContext(), 5, 2, Subprogram));
     IBuilder.SetCurrentDebugLocation(InlinedDL);
-    DBuilder.insertDeclare(Alloca, InlinedVar, E, InlinedDL, Store);
+    DBuilder.insertDeclare(Alloca, InlinedVar, E, InlinedDL,
+                           Store->getIterator());
     IBuilder.CreateStore(IBuilder.getInt32(2), Alloca);
     // Finalize the debug info.
     DBuilder.finalize();

From 2a358bac7874baed5d05221e8ae6c75266c407b7 Mon Sep 17 00:00:00 2001
From: Petr Hosek <phosek@google.com>
Date: Wed, 12 Feb 2025 11:37:07 -0800
Subject: [PATCH 140/282] [libc] Move __LLVM_LIBC__ define to
 __llvm-libc-common.h (#126877)

Relying on features.h is problematic since codebases are free to have
such a header on their search path, which breaks compilation. libc
should instead provide a more standard way of getting __LLVM_LIBC__.
Since __llvm-libc-common.h is included from all libc headers, defining
__LLVM_LIBC__ there ensures that this define is available whenever any
of the standard header is included.

(cherry picked from commit b0d782080529cf5d422847e1f91f29bd7c62f691)
---
 libc/include/__llvm-libc-common.h               | 2 ++
 libc/include/llvm-libc-macros/features-macros.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h
index b5a23c5765f4d..9112d1ebe302c 100644
--- a/libc/include/__llvm-libc-common.h
+++ b/libc/include/__llvm-libc-common.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_COMMON_H
 #define LLVM_LIBC_COMMON_H
 
+#define __LLVM_LIBC__ 1
+
 #ifdef __cplusplus
 
 #undef __BEGIN_C_DECLS
diff --git a/libc/include/llvm-libc-macros/features-macros.h b/libc/include/llvm-libc-macros/features-macros.h
index 5bc87a68fc0ba..f87ae4ad12408 100644
--- a/libc/include/llvm-libc-macros/features-macros.h
+++ b/libc/include/llvm-libc-macros/features-macros.h
@@ -9,6 +9,4 @@
 #ifndef LLVM_LIBC_MACROS_FEATURES_MACROS_H
 #define LLVM_LIBC_MACROS_FEATURES_MACROS_H
 
-#define __LLVM_LIBC__ 1
-
 #endif // LLVM_LIBC_MACROS_FEATURES_MACROS_H

From 37f7f151e586d3e3e5339286aeb4aac6ad7561ea Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Fri, 14 Feb 2025 16:21:53 +0100
Subject: [PATCH 141/282] Diagnose the code with trailing comma in the function
 call. (#125232)

This patch fixes a regression caused by
https://github.com/llvm/llvm-project/pull/114684 where clang accepts
trailing commas for function calls.

Fixes #125225

(cherry picked from commit 922f339c4ef3631f66dc4b8caa4c356103dbf69d)
---
 clang/lib/Parse/ParseExpr.cpp  | 2 ++
 clang/test/Parser/recovery.cpp | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index aa8b3870a188c..0cadede51a9b3 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -2237,6 +2237,8 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
             if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
               RunSignatureHelp();
             LHS = ExprError();
+          } else if (!HasError && HasTrailingComma) {
+            Diag(Tok, diag::err_expected_expression);
           } else if (LHS.isInvalid()) {
             for (auto &E : ArgExprs)
               Actions.CorrectDelayedTyposInExpr(E);
diff --git a/clang/test/Parser/recovery.cpp b/clang/test/Parser/recovery.cpp
index 4e2811c4cac92..2fce67a52c6b6 100644
--- a/clang/test/Parser/recovery.cpp
+++ b/clang/test/Parser/recovery.cpp
@@ -215,3 +215,10 @@ struct ::template foo, struct ::template bar; // expected-error 2 {{expected ide
 struct ::foo struct::; // expected-error {{no struct named 'foo' in the global namespace}} expected-error {{expected identifier}}
 class :: : {} a;  // expected-error {{expected identifier}} expected-error {{expected class name}}
 }
+
+namespace GH125225 {
+void func(int);
+void k() {
+  func(1, ); // expected-error {{expected expression}}
+}
+}

From 94b97d19b7fa0fd1b5959e788bac104a017e6755 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 30 Jan 2025 18:03:42 -0800
Subject: [PATCH 142/282] [clang-format] Fix annotation of Java/JavaScript
 keyword extends (#125038)

Uncovered in #124891.

(cherry picked from commit ea84474966f19af4f1f4a1250ec258af1c6e2571)
---
 clang/lib/Format/TokenAnnotator.cpp           | 5 +++++
 clang/unittests/Format/TokenAnnotatorTest.cpp | 1 +
 2 files changed, 6 insertions(+)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 4246ade6e19be..d0724f35b1542 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2586,6 +2586,11 @@ class AnnotatingParser {
     if (Tok.isNot(tok::identifier) || !Tok.Previous)
       return false;
 
+    if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
+        Tok.is(Keywords.kw_extends)) {
+      return false;
+    }
+
     if (const auto *NextNonComment = Tok.getNextNonComment();
         (!NextNonComment && !Line.InMacroBody) ||
         (NextNonComment &&
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 2147a1b950dd1..479c34e6c9c11 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3464,6 +3464,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   Tokens = annotate("a = class Foo extends goog.a {};",
                     getGoogleStyle(FormatStyle::LK_JavaScript));
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown); // Not TT_StartOfName
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_ClassLBrace);
   EXPECT_BRACE_KIND(Tokens[8], BK_Block);
   EXPECT_TOKEN(Tokens[9], tok::r_brace, TT_ClassRBrace);

From 6d3b034f8f7d47909309b9869f43397b64e99f91 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 30 Jan 2025 19:33:15 -0800
Subject: [PATCH 143/282] [clang-format] Add ClassHeadName to help annotating
 StartOfName (#124891)

Fixes #124574.

(cherry picked from commit 1a25bea852cd4f7d99e644953c31278f7f257ccd)
---
 clang/lib/Format/FormatToken.h                |  2 +
 clang/lib/Format/TokenAnnotator.cpp           |  2 +-
 clang/lib/Format/UnwrappedLineParser.cpp      | 15 ++++--
 clang/unittests/Format/TokenAnnotatorTest.cpp | 53 +++++++++++++++++++
 4 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index d97b6522f1fef..29aba281ae103 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -44,6 +44,8 @@ namespace format {
   TYPE(CaseLabelColon)                                                         \
   TYPE(CastRParen)                                                             \
   TYPE(ClassLBrace)                                                            \
+  /* Name of class/struct/union/interface definition. */                       \
+  TYPE(ClassHeadName)                                                          \
   TYPE(ClassRBrace)                                                            \
   TYPE(CompoundRequirementLBrace)                                              \
   /* ternary ?: expression */                                                  \
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index d0724f35b1542..87abe76ae1f7e 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2583,7 +2583,7 @@ class AnnotatingParser {
     if (Style.isVerilog())
       return false;
 
-    if (Tok.isNot(tok::identifier) || !Tok.Previous)
+    if (!Tok.Previous || Tok.isNot(tok::identifier) || Tok.is(TT_ClassHeadName))
       return false;
 
     if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 120922d271aab..1411197e32554 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -4029,7 +4029,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
   const FormatToken &InitialToken = *FormatTok;
   nextToken();
 
-  const FormatToken *ClassName = nullptr;
+  FormatToken *ClassName = nullptr;
   bool IsDerived = false;
   auto IsNonMacroIdentifier = [](const FormatToken *Tok) {
     return Tok->is(tok::identifier) && Tok->TokenText != Tok->TokenText.upper();
@@ -4059,7 +4059,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
     }
     if (FormatTok->is(tok::l_square) && handleCppAttributes())
       continue;
-    const auto *Previous = FormatTok;
+    auto *Previous = FormatTok;
     nextToken();
     switch (FormatTok->Tok.getKind()) {
     case tok::l_paren:
@@ -4074,9 +4074,12 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
     case tok::hashhash:
       break;
     default:
-      if (!JSPastExtendsOrImplements && !ClassName &&
-          Previous->is(tok::identifier) && Previous->isNot(TT_AttributeMacro) &&
-          Previous->TokenText != Previous->TokenText.upper()) {
+      if (JSPastExtendsOrImplements || ClassName ||
+          Previous->isNot(tok::identifier) || Previous->is(TT_AttributeMacro)) {
+        break;
+      }
+      if (const auto Text = Previous->TokenText;
+          Text.size() == 1 || Text != Text.upper()) {
         ClassName = Previous;
       }
     }
@@ -4160,6 +4163,8 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
   if (FormatTok->is(tok::l_brace)) {
     if (IsListInitialization())
       return;
+    if (ClassName)
+      ClassName->setFinalizedType(TT_ClassHeadName);
     auto [OpenBraceType, ClosingBraceType] = GetBraceTypes(InitialToken);
     FormatTok->setFinalizedType(OpenBraceType);
     if (ParseAsExpr) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 479c34e6c9c11..7e90bde8755fb 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -479,11 +479,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) {
 TEST_F(TokenAnnotatorTest, UnderstandsClasses) {
   auto Tokens = annotate("class C {};");
   ASSERT_EQ(Tokens.size(), 6u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_ClassLBrace);
   EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_ClassRBrace);
 
   Tokens = annotate("const class C {} c;");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_ClassLBrace);
   EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_ClassRBrace);
 
@@ -494,6 +496,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsClasses) {
 
   Tokens = annotate("class [[deprecated(\"\")]] C { int i; };");
   ASSERT_EQ(Tokens.size(), 17u) << Tokens;
+  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[10], tok::l_brace, TT_ClassLBrace);
   EXPECT_TOKEN(Tokens[14], tok::r_brace, TT_ClassRBrace);
 }
@@ -501,21 +504,25 @@ TEST_F(TokenAnnotatorTest, UnderstandsClasses) {
 TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   auto Tokens = annotate("struct S {};");
   ASSERT_EQ(Tokens.size(), 6u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_StructRBrace);
 
   Tokens = annotate("struct macro(a) S {};");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[7], tok::r_brace, TT_StructRBrace);
 
   Tokens = annotate("struct EXPORT_MACRO [[nodiscard]] C { int i; };");
   ASSERT_EQ(Tokens.size(), 15u) << Tokens;
+  EXPECT_TOKEN(Tokens[7], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[12], tok::r_brace, TT_StructRBrace);
 
   Tokens = annotate("struct [[deprecated]] [[nodiscard]] C { int i; };");
   ASSERT_EQ(Tokens.size(), 19u) << Tokens;
+  EXPECT_TOKEN(Tokens[11], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[16], tok::r_brace, TT_StructRBrace);
 
@@ -523,12 +530,14 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
                     "  void f(T &t);\n"
                     "};");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[11], tok::amp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[15], tok::r_brace, TT_StructRBrace);
 
   Tokens = annotate("template <typename T> struct S<const T[N]> {};");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare);
   EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
@@ -537,6 +546,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
 
   Tokens = annotate("template <typename T> struct S<T const[N]> {};");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare);
   EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
@@ -547,6 +557,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
                     "  void f(T const (&a)[n]);\n"
                     "};");
   ASSERT_EQ(Tokens.size(), 35u) << Tokens;
+  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[10], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[13], tok::l_square, TT_ArraySubscriptLSquare);
   EXPECT_TOKEN(Tokens[16], tok::greater, TT_TemplateCloser);
@@ -558,12 +569,24 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
 
   Tokens = annotate("template <typename T, enum E e> struct S {};");
   ASSERT_EQ(Tokens.size(), 15u) << Tokens;
+  EXPECT_TOKEN(Tokens[10], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_StructLBrace);
 
+  Tokens = annotate(
+      "template <> struct __declspec(foo) Op<Bar *> : OpImpl<Bar *> {};");
+  ASSERT_EQ(Tokens.size(), 23u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_AttributeLParen);
+  EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_AttributeRParen);
+  EXPECT_TOKEN(Tokens[8], tok::identifier, TT_ClassHeadName);
+  EXPECT_TOKEN(Tokens[13], tok::colon, TT_InheritanceColon);
+  EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_StructLBrace);
+  EXPECT_TOKEN(Tokens[20], tok::r_brace, TT_StructRBrace);
+
   constexpr StringRef Code{"struct EXPORT StructName {};"};
 
   Tokens = annotate(Code);
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_StructRBrace);
 
@@ -572,6 +595,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   Tokens = annotate(Code, Style);
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::identifier, TT_AttributeMacro);
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_StructRBrace);
 }
@@ -579,11 +603,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
 TEST_F(TokenAnnotatorTest, UnderstandsUnions) {
   auto Tokens = annotate("union U {};");
   ASSERT_EQ(Tokens.size(), 6u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_UnionLBrace);
   EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_UnionRBrace);
 
   Tokens = annotate("union U { void f() { return; } };");
   ASSERT_EQ(Tokens.size(), 14u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_UnionLBrace);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_FunctionLBrace);
   EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_UnionRBrace);
@@ -716,8 +742,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsTemplateTemplateParameters) {
   EXPECT_TOKEN(Tokens[11], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[14], tok::greater, TT_TemplateCloser);
   EXPECT_FALSE(Tokens[14]->ClosesTemplateDeclaration);
+  EXPECT_TOKEN(Tokens[16], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[21], tok::greater, TT_TemplateCloser);
   EXPECT_TRUE(Tokens[21]->ClosesTemplateDeclaration);
+  EXPECT_TOKEN(Tokens[23], tok::identifier, TT_ClassHeadName);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsWhitespaceSensitiveMacros) {
@@ -800,6 +828,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) {
 
   Tokens = annotate("return (struct foo){};");
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[4], tok::r_paren, TT_CastRParen);
 
   Tokens = annotate("#define FOO(bar) foo((uint64_t)&bar)");
@@ -1169,6 +1198,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) {
   EXPECT_EQ(Tokens[32]->FakeRParens, 1u);
   EXPECT_TOKEN(Tokens[33], tok::r_paren, TT_Unknown);
   EXPECT_TRUE(Tokens[33]->ClosesRequiresClause);
+  EXPECT_TOKEN(Tokens[35], tok::identifier, TT_Unknown);
 
   Tokens = annotate("template <typename T>\n"
                     "void foo(T) noexcept requires Bar<T>;");
@@ -1250,6 +1280,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) {
                     "return number_zero_v<T>; }\n"
                     "};");
   ASSERT_EQ(Tokens.size(), 44u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName);
+  EXPECT_TOKEN(Tokens[11], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[13], tok::kw_requires, TT_RequiresClause);
   EXPECT_TOKEN(Tokens[14], tok::kw_requires, TT_RequiresExpression);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_RequiresExpressionLBrace);
@@ -1261,6 +1293,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) {
       annotate("template <class A, class B> concept C ="
                "std::same_as<std::iter_value_t<A>, std::iter_value_t<B>>;");
   ASSERT_EQ(Tokens.size(), 31u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[8], tok::kw_concept, TT_Unknown);
   EXPECT_TOKEN(Tokens[14], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[18], tok::less, TT_TemplateOpener);
@@ -1877,6 +1911,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::arrow, TT_LambdaArrow);
+  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("foo([&](u32 bar) __attribute__((attr)) -> void {});");
@@ -2757,6 +2792,7 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
 
   // Structured statements.
   Tokens = Annotate("class Foo {}");
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_StartOfName);
   EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_FunctionLBrace);
   Tokens = Annotate("def Def: Foo {}");
   EXPECT_TOKEN(Tokens[2], tok::colon, TT_InheritanceColon);
@@ -3222,6 +3258,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
 
   Tokens = annotate("class Foo<int> f() {}");
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[5], tok::identifier, TT_FunctionDeclarationName);
   EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_FunctionDeclarationLParen);
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_FunctionLBrace);
@@ -3230,6 +3267,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
 
   Tokens = annotate("template <typename T> class Foo<T> f() {}");
   ASSERT_EQ(Tokens.size(), 16u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName);
   EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_FunctionDeclarationLParen);
   EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace);
@@ -3340,36 +3378,45 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
 
   Tokens = annotate("struct ::Foo {};");
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[3], BK_Block);
   EXPECT_BRACE_KIND(Tokens[4], BK_Block);
 
   Tokens = annotate("struct NS::Foo {};");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown);
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[4], BK_Block);
   EXPECT_BRACE_KIND(Tokens[5], BK_Block);
 
   Tokens = annotate("struct Foo<int> {};");
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[5], BK_Block);
   EXPECT_BRACE_KIND(Tokens[6], BK_Block);
 
   Tokens = annotate("struct Foo<int>::Bar {};");
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown);
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[7], BK_Block);
   EXPECT_BRACE_KIND(Tokens[8], BK_Block);
 
   Tokens = annotate("struct Foo<int> : Base {};");
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[7], BK_Block);
   EXPECT_BRACE_KIND(Tokens[8], BK_Block);
 
   Tokens = annotate("struct Foo final {};");
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[3], BK_Block);
   EXPECT_BRACE_KIND(Tokens[4], BK_Block);
 
   Tokens = annotate("struct [[foo]] [[bar]] Foo final : Base1, Base2 {};");
   ASSERT_EQ(Tokens.size(), 21u) << Tokens;
+  EXPECT_TOKEN(Tokens[11], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[17], BK_Block);
   EXPECT_BRACE_KIND(Tokens[18], BK_Block);
 
@@ -3405,6 +3452,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
                     "#endif\n"
                     "};");
   ASSERT_EQ(Tokens.size(), 29u) << Tokens;
+  EXPECT_TOKEN(Tokens[8], tok::identifier, TT_ClassHeadName);
   EXPECT_BRACE_KIND(Tokens[11], BK_Block);
   EXPECT_BRACE_KIND(Tokens[17], BK_Block);
   EXPECT_BRACE_KIND(Tokens[22], BK_Block);
@@ -3456,6 +3504,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   Tokens = annotate("a = class extends goog.a {};",
                     getGoogleStyle(FormatStyle::LK_JavaScript));
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_ClassLBrace);
   EXPECT_BRACE_KIND(Tokens[7], BK_Block);
   EXPECT_TOKEN(Tokens[8], tok::r_brace, TT_ClassRBrace);
@@ -3464,6 +3513,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   Tokens = annotate("a = class Foo extends goog.a {};",
                     getGoogleStyle(FormatStyle::LK_JavaScript));
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown); // Not TT_StartOfName
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_ClassLBrace);
   EXPECT_BRACE_KIND(Tokens[8], BK_Block);
@@ -3473,6 +3523,8 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   Tokens = annotate("#define FOO(X) \\\n"
                     "  struct X##_tag_ {};");
   ASSERT_EQ(Tokens.size(), 14u) << Tokens;
+  EXPECT_TOKEN(Tokens[7], tok::identifier, TT_Unknown);
+  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[10], tok::l_brace, TT_StructLBrace);
   EXPECT_BRACE_KIND(Tokens[10], BK_Block);
   EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace);
@@ -3483,6 +3535,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
                     "    void f() { return; } \\\n"
                     "  };");
   ASSERT_EQ(Tokens.size(), 20u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_ClassHeadName);
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace);
   EXPECT_BRACE_KIND(Tokens[8], BK_Block);
   EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName);

From 2eb558c99d967af093340def231c9a1ba228571b Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Thu, 13 Feb 2025 14:19:51 -0800
Subject: [PATCH 144/282] [SLP] Check for PHI nodes (potentially cycles!) when
 checking dependencies

When checking for dependecies for gather nodes with users with the same
last instruction, cannot rely on the index order, if there is (even
potential!) cycle in the graph, which may cause order not work correctly
and cause compiler crash.

Fixes #127128

(cherry picked from commit ac217ee389d63124432e5e6890851a678f7a676b)
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 10 +++-
 .../X86/delayed-gather-emission.ll            |  2 +-
 .../X86/matching-gather-nodes-phi-users.ll    |  2 +-
 .../SLPVectorizer/X86/phi-node-with-cycle.ll  | 59 +++++++++++++++++++
 4 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 19963e780ebd3..7b20eda550095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13181,8 +13181,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
           continue;
         // If the user instruction is used for some reason in different
         // vectorized nodes - make it depend on index.
+        // If any vector node is PHI node, this dependency might not work
+        // because of cycle dependencies, so disable it.
         if (TEUseEI.UserTE != UseEI.UserTE &&
-            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
+            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
+             any_of(
+                 VectorizableTree,
+                 [](const std::unique_ptr<TreeEntry> &TE) {
+                   return TE->State == TreeEntry::Vectorize &&
+                          TE->getOpcode() == Instruction::PHI;
+                 })))
           continue;
       }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
index 5562291dbb6be..bf3f0c4df74e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
@@ -31,7 +31,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[I2]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP9]] = insertelement <2 x float> [[TMP8]], float [[I2]], i32 0
-; CHECK-NEXT:    [[TMP10]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP10]] = insertelement <2 x float> [[TMP2]], float [[I2]], i32 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[BB1]], label [[BB2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
index 166c819098c8c..d649465c9ff12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
@@ -8,7 +8,7 @@
 ; YAML: Function:        test
 ; YAML: Args:
 ; YAML:   - String:          'Stores SLP vectorized with cost '
-; YAML:   - Cost:            '-6'
+; YAML:   - Cost:            '-3'
 ; YAML:   - String:          ' and with tree size '
 ; YAML:   - TreeSize:        '14'
 ; YAML: ...
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
new file mode 100644
index 0000000000000..22e7e6a8e6624
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell < %s | FileCheck %s
+
+define void @test(float %0) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv <2 x float> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    br label %[[BB6:.*]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    br label %[[BB10:.*]]
+; CHECK:       [[BB9:.*]]:
+; CHECK-NEXT:    br label %[[BB10]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB6]] ], [ poison, %[[BB9]] ]
+; CHECK-NEXT:    br label %[[BB12:.*]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call float @llvm.fabs.f32(float [[TMP17]])
+; CHECK-NEXT:    ret void
+;
+  %2 = fdiv float 0.000000e+00, 0.000000e+00
+  %3 = fdiv float 0.000000e+00, 0.000000e+00
+  %4 = fdiv float %0, 0.000000e+00
+  br label %5
+
+5:
+  %6 = fmul float %4, 0.000000e+00
+  %7 = fsub float 0.000000e+00, %6
+  %8 = fmul float %3, 0.000000e+00
+  %9 = fsub float 0.000000e+00, %8
+  br label %11
+
+10:
+  br label %11
+
+11:
+  %12 = phi float [ %7, %5 ], [ 0.000000e+00, %10 ]
+  %13 = phi float [ %9, %5 ], [ 0.000000e+00, %10 ]
+  br label %14
+
+14:
+  %15 = fmul float %2, 0.000000e+00
+  %16 = fsub float %12, %15
+  %17 = fmul float %4, 0.000000e+00
+  %18 = fsub float %13, %17
+  %19 = fadd float %16, %18
+  %20 = call float @llvm.fabs.f32(float %19)
+  ret void
+}
+

From d90afbca9367ef98273e1b3dae7fa26766e0335d Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 14 Feb 2025 17:28:02 -0800
Subject: [PATCH 145/282] Revert "[SLP] Check for PHI nodes (potentially
 cycles!) when checking dependencies"

This reverts commit 2eb558c99d967af093340def231c9a1ba228571b.

This commit did not pass pre-commit CI.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 10 +---
 .../X86/delayed-gather-emission.ll            |  2 +-
 .../X86/matching-gather-nodes-phi-users.ll    |  2 +-
 .../SLPVectorizer/X86/phi-node-with-cycle.ll  | 59 -------------------
 4 files changed, 3 insertions(+), 70 deletions(-)
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7b20eda550095..19963e780ebd3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13181,16 +13181,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
           continue;
         // If the user instruction is used for some reason in different
         // vectorized nodes - make it depend on index.
-        // If any vector node is PHI node, this dependency might not work
-        // because of cycle dependencies, so disable it.
         if (TEUseEI.UserTE != UseEI.UserTE &&
-            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
-             any_of(
-                 VectorizableTree,
-                 [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->State == TreeEntry::Vectorize &&
-                          TE->getOpcode() == Instruction::PHI;
-                 })))
+            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
           continue;
       }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
index bf3f0c4df74e4..5562291dbb6be 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
@@ -31,7 +31,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[I2]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP9]] = insertelement <2 x float> [[TMP8]], float [[I2]], i32 0
-; CHECK-NEXT:    [[TMP10]] = insertelement <2 x float> [[TMP2]], float [[I2]], i32 0
+; CHECK-NEXT:    [[TMP10]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[BB1]], label [[BB2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
index d649465c9ff12..166c819098c8c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
@@ -8,7 +8,7 @@
 ; YAML: Function:        test
 ; YAML: Args:
 ; YAML:   - String:          'Stores SLP vectorized with cost '
-; YAML:   - Cost:            '-3'
+; YAML:   - Cost:            '-6'
 ; YAML:   - String:          ' and with tree size '
 ; YAML:   - TreeSize:        '14'
 ; YAML: ...
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
deleted file mode 100644
index 22e7e6a8e6624..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell < %s | FileCheck %s
-
-define void @test(float %0) {
-; CHECK-LABEL: define void @test(
-; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fdiv <2 x float> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    br label %[[BB6:.*]]
-; CHECK:       [[BB6]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    br label %[[BB10:.*]]
-; CHECK:       [[BB9:.*]]:
-; CHECK-NEXT:    br label %[[BB10]]
-; CHECK:       [[BB10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB6]] ], [ poison, %[[BB9]] ]
-; CHECK-NEXT:    br label %[[BB12:.*]]
-; CHECK:       [[BB12]]:
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = call float @llvm.fabs.f32(float [[TMP17]])
-; CHECK-NEXT:    ret void
-;
-  %2 = fdiv float 0.000000e+00, 0.000000e+00
-  %3 = fdiv float 0.000000e+00, 0.000000e+00
-  %4 = fdiv float %0, 0.000000e+00
-  br label %5
-
-5:
-  %6 = fmul float %4, 0.000000e+00
-  %7 = fsub float 0.000000e+00, %6
-  %8 = fmul float %3, 0.000000e+00
-  %9 = fsub float 0.000000e+00, %8
-  br label %11
-
-10:
-  br label %11
-
-11:
-  %12 = phi float [ %7, %5 ], [ 0.000000e+00, %10 ]
-  %13 = phi float [ %9, %5 ], [ 0.000000e+00, %10 ]
-  br label %14
-
-14:
-  %15 = fmul float %2, 0.000000e+00
-  %16 = fsub float %12, %15
-  %17 = fmul float %4, 0.000000e+00
-  %18 = fsub float %13, %17
-  %19 = fadd float %16, %18
-  %20 = call float @llvm.fabs.f32(float %19)
-  ret void
-}
-

From 4fa59872f61b2c01abeac85e49a4a46e828fae9c Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 12 Feb 2025 10:00:10 +1100
Subject: [PATCH 146/282] [ORC] Switch to singleton pattern for
 UnwindInfoManager. (#126691)

The find-dynamic-unwind-info callback registration APIs in libunwind
limit the number of callbacks that can be registered. If we use multiple
UnwindInfoManager instances, each with their own own callback function
(as was the case prior to this patch) we can quickly exceed this limit
(see https://github.com/llvm/llvm-project/issues/126611).

This patch updates the UnwindInfoManager class to use a singleton
pattern, with the single instance shared between all LLVM JITs in the
process.

This change does _not_ apply to compact unwind info registered through
the ORC runtime (which currently installs its own callbacks).

As a bonus this change eliminates the need to load an IR "bouncer"
module to supply the unique callback for each instance, so support for
compact-unwind can be extended to the llvm-jitlink tools (which does not
support adding IR).

(cherry picked from commit 84fe1f63b02414085bf7a8434caaf4a358be86da)
---
 .../ExecutionEngine/Orc/Shared/OrcRTBridge.h  |   4 -
 .../Orc/TargetProcess/UnwindInfoManager.h     |  52 +++--
 .../Orc/UnwindInfoRegistrationPlugin.h        |  25 +--
 .../Orc/ExecutorProcessControl.cpp            |   5 +-
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp        |   4 +-
 .../Orc/Shared/OrcRTBridge.cpp                |   8 -
 .../Orc/TargetProcess/UnwindInfoManager.cpp   | 181 ++++++++----------
 .../Orc/UnwindInfoRegistrationPlugin.cpp      | 138 ++-----------
 .../llvm-jitlink-executor.cpp                 |   5 +
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp      |  14 ++
 10 files changed, 146 insertions(+), 290 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
index db5ff135a7164..0f59edd429332 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
@@ -90,10 +90,6 @@ using SPSRunAsIntFunctionSignature = int32_t(shared::SPSExecutorAddr, int32_t);
 } // end namespace rt
 
 namespace rt_alt {
-extern const char *UnwindInfoManagerInstanceName;
-extern const char *UnwindInfoManagerFindSectionsHelperName;
-extern const char *UnwindInfoManagerEnableWrapperName;
-extern const char *UnwindInfoManagerDisableWrapperName;
 extern const char *UnwindInfoManagerRegisterActionName;
 extern const char *UnwindInfoManagerDeregisterActionName;
 } // end namespace rt_alt
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h
index fc7719f282122..847c340eff17d 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h
@@ -15,14 +15,13 @@
 #define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H
 
 #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
-#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h"
 #include "llvm/Support/Error.h"
 #include <map>
 #include <mutex>
 
 namespace llvm::orc {
 
-class UnwindInfoManager : public ExecutorBootstrapService {
+class UnwindInfoManager {
 public:
   // This struct's layout should match the unw_dynamic_unwind_sections struct
   // from libunwind/src/libunwid_ext.h.
@@ -34,43 +33,40 @@ class UnwindInfoManager : public ExecutorBootstrapService {
     size_t compact_unwind_section_length;
   };
 
+  UnwindInfoManager(UnwindInfoManager &&) = delete;
+  UnwindInfoManager &operator=(UnwindInfoManager &&) = delete;
+  ~UnwindInfoManager();
+
   /// If the libunwind find-dynamic-unwind-info callback registration APIs are
-  /// available then this method will return an UnwindInfoManager instance,
-  /// otherwise it will return nullptr.
-  static std::unique_ptr<UnwindInfoManager> TryCreate();
+  /// available then this method will instantiate a global UnwindInfoManager
+  /// instance suitable for the process and return true. Otherwise it will
+  /// return false.
+  static bool TryEnable();
 
-  Error shutdown() override;
-  void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override;
+  static void addBootstrapSymbols(StringMap<ExecutorAddr> &M);
 
-  Error enable(void *FindDynamicUnwindSections);
-  Error disable(void);
+  static Error registerSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges,
+                                orc::ExecutorAddr DSOBase,
+                                orc::ExecutorAddrRange DWARFEHFrame,
+                                orc::ExecutorAddrRange CompactUnwind);
 
-  Error registerSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges,
-                         orc::ExecutorAddr DSOBase,
-                         orc::ExecutorAddrRange DWARFEHFrame,
-                         orc::ExecutorAddrRange CompactUnwind);
+  static Error deregisterSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges);
 
-  Error deregisterSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges);
+private:
+  UnwindInfoManager() = default;
 
-  int findSections(uintptr_t Addr, UnwindSections *Info);
+  int findSectionsImpl(uintptr_t Addr, UnwindSections *Info);
+  static int findSections(uintptr_t Addr, UnwindSections *Info);
 
-private:
-  UnwindInfoManager(int (*AddFindDynamicUnwindSections)(void *),
-                    int (*RemoveFindDynamicUnwindSections)(void *))
-      : AddFindDynamicUnwindSections(AddFindDynamicUnwindSections),
-        RemoveFindDynamicUnwindSections(RemoveFindDynamicUnwindSections) {}
+  Error registerSectionsImpl(ArrayRef<orc::ExecutorAddrRange> CodeRanges,
+                             orc::ExecutorAddr DSOBase,
+                             orc::ExecutorAddrRange DWARFEHFrame,
+                             orc::ExecutorAddrRange CompactUnwind);
 
-  static int findSectionsHelper(UnwindInfoManager *Instance, uintptr_t Addr,
-                                UnwindSections *Info);
+  Error deregisterSectionsImpl(ArrayRef<orc::ExecutorAddrRange> CodeRanges);
 
   std::mutex M;
   std::map<uintptr_t, UnwindSections> UWSecs;
-
-  int (*AddFindDynamicUnwindSections)(void *) = nullptr;
-  int (*RemoveFindDynamicUnwindSections)(void *) = nullptr;
-  void *FindDynamicUnwindSections = nullptr;
-
-  static const char *AddFnName, *RemoveFnName;
 };
 
 } // namespace llvm::orc
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h
index eb883a79a93d8..65f20ad3b2163 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h
@@ -19,15 +19,17 @@ namespace llvm::orc {
 
 class UnwindInfoRegistrationPlugin : public LinkGraphLinkingLayer::Plugin {
 public:
-  static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
-  Create(IRLayer &IRL, JITDylib &PlatformJD, ExecutorAddr Instance,
-         ExecutorAddr FindHelper, ExecutorAddr Enable, ExecutorAddr Disable,
-         ExecutorAddr Register, ExecutorAddr Deregister);
+  UnwindInfoRegistrationPlugin(ExecutionSession &ES, ExecutorAddr Register,
+                               ExecutorAddr Deregister)
+      : ES(ES), Register(Register), Deregister(Deregister) {
+    DSOBaseName = ES.intern("__jitlink$libunwind_dso_base");
+  }
 
   static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
-  Create(IRLayer &IRL, JITDylib &PlatformJD);
+  Create(ExecutionSession &ES, ExecutorAddr Register, ExecutorAddr Deregister);
 
-  ~UnwindInfoRegistrationPlugin();
+  static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
+  Create(ExecutionSession &ES);
 
   void modifyPassConfig(MaterializationResponsibility &MR,
                         jitlink::LinkGraph &G,
@@ -49,20 +51,11 @@ class UnwindInfoRegistrationPlugin : public LinkGraphLinkingLayer::Plugin {
                                    ResourceKey SrcKey) override {}
 
 private:
-  UnwindInfoRegistrationPlugin(ExecutionSession &ES, ExecutorAddr Instance,
-                               ExecutorAddr Disable, ExecutorAddr Register,
-                               ExecutorAddr Deregister)
-      : ES(ES), Instance(Instance), Disable(Disable), Register(Register),
-        Deregister(Deregister) {
-    DSOBaseName = ES.intern("__jitlink$libunwind_dso_base");
-  }
-
-  static Expected<ThreadSafeModule> makeBouncerModule(ExecutionSession &ES);
   Error addUnwindInfoRegistrationActions(jitlink::LinkGraph &G);
 
   ExecutionSession &ES;
   SymbolStringPtr DSOBaseName;
-  ExecutorAddr Instance, Disable, Register, Deregister;
+  ExecutorAddr Register, Deregister;
 };
 
 } // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index 7b38150ab4b65..45cb28af56050 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -54,9 +54,8 @@ SelfExecutorProcessControl::SelfExecutorProcessControl(
   // FIXME: Don't add an UnwindInfoManager by default -- it's redundant when
   //        the ORC runtime is loaded. We'll need a way to document this and
   //        allow clients to choose.
-  this->UnwindInfoMgr = UnwindInfoManager::TryCreate();
-  if (this->UnwindInfoMgr)
-    this->UnwindInfoMgr->addBootstrapSymbols(this->BootstrapSymbols);
+  if (UnwindInfoManager::TryEnable())
+    UnwindInfoManager::addBootstrapSymbols(this->BootstrapSymbols);
 #endif // __APPLE__
 }
 
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index dd844ae3a42bc..972c24abc7506 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -1241,8 +1241,8 @@ Expected<JITDylibSP> setUpGenericLLVMIRPlatform(LLJIT &J) {
 
       // If UseEHFrames hasn't been set then we're good to use compact-unwind.
       if (!UseEHFrames) {
-        if (auto UIRP = UnwindInfoRegistrationPlugin::Create(
-                J.getIRCompileLayer(), PlatformJD)) {
+        if (auto UIRP =
+                UnwindInfoRegistrationPlugin::Create(J.getExecutionSession())) {
           OLL->addPlugin(std::move(*UIRP));
           LLVM_DEBUG(dbgs() << "Enabled compact-unwind support.\n");
         } else
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index fef3ff989a52a..d3b3f121cfcd9 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -65,14 +65,6 @@ const char *RunAsIntFunctionWrapperName =
 
 } // end namespace rt
 namespace rt_alt {
-const char *UnwindInfoManagerInstanceName =
-    "orc_rt_alt_UnwindInfoManager_Instance";
-const char *UnwindInfoManagerFindSectionsHelperName =
-    "orc_rt_alt_UnwindInfoManager_findSectionsHelper";
-const char *UnwindInfoManagerEnableWrapperName =
-    "orc_rt_alt_UnwindInfoManager_enable";
-const char *UnwindInfoManagerDisableWrapperName =
-    "orc_rt_alt_UnwindInfoManager_disable";
 const char *UnwindInfoManagerRegisterActionName =
     "orc_rt_alt_UnwindInfoManager_register";
 const char *UnwindInfoManagerDeregisterActionName =
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
index 9f748154c03e5..d67cbb807f2cd 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
@@ -9,7 +9,10 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
-#include "llvm/Support/DynamicLibrary.h"
+
+#ifdef __APPLE__
+#include <dlfcn.h>
+#endif // __APPLE__
 
 #define DEBUG_TYPE "orc"
 
@@ -17,40 +20,17 @@ using namespace llvm;
 using namespace llvm::orc;
 using namespace llvm::orc::shared;
 
-static orc::shared::CWrapperFunctionResult
-llvm_orc_rt_alt_UnwindInfoManager_enable(const char *Data, uint64_t Size) {
-  return WrapperFunction<SPSError(SPSExecutorAddr, SPSExecutorAddr)>::handle(
-             Data, Size,
-             [](ExecutorAddr Instance, ExecutorAddr FindFn) {
-               return Instance.toPtr<UnwindInfoManager *>()->enable(
-                   FindFn.toPtr<void *>());
-             })
-      .release();
-}
-
-static orc::shared::CWrapperFunctionResult
-llvm_orc_rt_alt_UnwindInfoManager_disable(const char *Data, uint64_t Size) {
-  return WrapperFunction<SPSError(SPSExecutorAddr)>::handle(
-             Data, Size,
-             [](ExecutorAddr Instance) {
-               return Instance.toPtr<UnwindInfoManager *>()->disable();
-             })
-      .release();
-}
-
 static orc::shared::CWrapperFunctionResult
 llvm_orc_rt_alt_UnwindInfoManager_register(const char *Data, uint64_t Size) {
-  using SPSSig =
-      SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>,
-               SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange);
+  using SPSSig = SPSError(SPSSequence<SPSExecutorAddrRange>, SPSExecutorAddr,
+                          SPSExecutorAddrRange, SPSExecutorAddrRange);
 
   return WrapperFunction<SPSSig>::handle(
              Data, Size,
-             [](ExecutorAddr Instance,
-                std::vector<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase,
+             [](std::vector<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase,
                 ExecutorAddrRange DWARFRange,
                 ExecutorAddrRange CompactUnwindRange) {
-               return Instance.toPtr<UnwindInfoManager *>()->registerSections(
+               return UnwindInfoManager::registerSections(
                    CodeRanges, DSOBase, DWARFRange, CompactUnwindRange);
              })
       .release();
@@ -58,89 +38,100 @@ llvm_orc_rt_alt_UnwindInfoManager_register(const char *Data, uint64_t Size) {
 
 static orc::shared::CWrapperFunctionResult
 llvm_orc_rt_alt_UnwindInfoManager_deregister(const char *Data, uint64_t Size) {
-  using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>);
+  using SPSSig = SPSError(SPSSequence<SPSExecutorAddrRange>);
 
   return WrapperFunction<SPSSig>::handle(
              Data, Size,
-             [](ExecutorAddr Instance,
-                std::vector<ExecutorAddrRange> CodeRanges) {
-               return Instance.toPtr<UnwindInfoManager *>()->deregisterSections(
-                   CodeRanges);
+             [](std::vector<ExecutorAddrRange> CodeRanges) {
+               return UnwindInfoManager::deregisterSections(CodeRanges);
              })
       .release();
 }
 
 namespace llvm::orc {
 
-const char *UnwindInfoManager::AddFnName =
-    "__unw_add_find_dynamic_unwind_sections";
-const char *UnwindInfoManager::RemoveFnName =
-    "__unw_remove_find_dynamic_unwind_sections";
-
-std::unique_ptr<UnwindInfoManager> UnwindInfoManager::TryCreate() {
-  std::string ErrMsg;
-  auto DL = sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg);
-  if (!DL.isValid())
-    return nullptr;
-
-  auto AddFindDynamicUnwindSections =
-      (int (*)(void *))DL.getAddressOfSymbol(AddFnName);
-  if (!AddFindDynamicUnwindSections)
-    return nullptr;
-
-  auto RemoveFindDynamicUnwindSections =
-      (int (*)(void *))DL.getAddressOfSymbol(RemoveFnName);
-  if (!RemoveFindDynamicUnwindSections)
-    return nullptr;
-
-  return std::unique_ptr<UnwindInfoManager>(new UnwindInfoManager(
-      AddFindDynamicUnwindSections, RemoveFindDynamicUnwindSections));
+static const char *AddFnName = "__unw_add_find_dynamic_unwind_sections";
+static const char *RemoveFnName = "__unw_remove_find_dynamic_unwind_sections";
+static std::unique_ptr<UnwindInfoManager> Instance;
+static int (*RemoveFindDynamicUnwindSections)(void *) = nullptr;
+
+UnwindInfoManager::~UnwindInfoManager() {
+  if (int Err = RemoveFindDynamicUnwindSections((void *)&findSections)) {
+    LLVM_DEBUG({
+      dbgs() << "Failed call to " << RemoveFnName << ": error = " << Err
+             << "\n";
+    });
+  }
 }
 
-Error UnwindInfoManager::shutdown() { return Error::success(); }
+bool UnwindInfoManager::TryEnable() {
+#ifdef __APPLE__
+  static std::mutex M;
+  std::lock_guard<std::mutex> Lock(M);
+
+  if (Instance)
+    return true;
+
+  auto AddFn = (int (*)(void *))dlsym(RTLD_DEFAULT, AddFnName);
+  if (!AddFn)
+    return false;
+
+  auto RemoveFn = (int (*)(void *))dlsym(RTLD_DEFAULT, RemoveFnName);
+  if (!RemoveFn)
+    return false;
+
+  Instance.reset(new UnwindInfoManager());
+
+  if (auto Err = AddFn((void *)&findSections)) {
+    LLVM_DEBUG({
+      dbgs() << "Failed call to " << AddFnName << ": error = " << Err << "\n";
+    });
+    Instance = nullptr;
+    return false;
+  }
+
+  RemoveFindDynamicUnwindSections = RemoveFn;
+  return true;
+
+#else
+  return false;
+#endif // __APPLE__
+}
 
 void UnwindInfoManager::addBootstrapSymbols(StringMap<ExecutorAddr> &M) {
-  M[rt_alt::UnwindInfoManagerInstanceName] = ExecutorAddr::fromPtr(this);
-  M[rt_alt::UnwindInfoManagerFindSectionsHelperName] =
-      ExecutorAddr::fromPtr(&findSectionsHelper);
-  M[rt_alt::UnwindInfoManagerEnableWrapperName] =
-      ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_enable);
-  M[rt_alt::UnwindInfoManagerDisableWrapperName] =
-      ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_disable);
   M[rt_alt::UnwindInfoManagerRegisterActionName] =
       ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_register);
   M[rt_alt::UnwindInfoManagerDeregisterActionName] =
       ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_deregister);
 }
 
-Error UnwindInfoManager::enable(void *FindDynamicUnwindSections) {
-  LLVM_DEBUG(dbgs() << "Enabling UnwindInfoManager.\n");
-
-  if (auto Err = AddFindDynamicUnwindSections(FindDynamicUnwindSections))
-    return make_error<StringError>(Twine("Could not register function via ") +
-                                       AddFnName +
-                                       ", error code = " + Twine(Err),
-                                   inconvertibleErrorCode());
-
-  this->FindDynamicUnwindSections = FindDynamicUnwindSections;
-  return Error::success();
+Error UnwindInfoManager::registerSections(
+    ArrayRef<orc::ExecutorAddrRange> CodeRanges, orc::ExecutorAddr DSOBase,
+    orc::ExecutorAddrRange DWARFEHFrame, orc::ExecutorAddrRange CompactUnwind) {
+  return Instance->registerSectionsImpl(CodeRanges, DSOBase, DWARFEHFrame,
+                                        CompactUnwind);
 }
 
-Error UnwindInfoManager::disable(void) {
-  LLVM_DEBUG(dbgs() << "Disabling UnwindInfoManager.\n");
+Error UnwindInfoManager::deregisterSections(
+    ArrayRef<orc::ExecutorAddrRange> CodeRanges) {
+  return Instance->deregisterSectionsImpl(CodeRanges);
+}
 
-  if (FindDynamicUnwindSections)
-    if (auto Err = RemoveFindDynamicUnwindSections(FindDynamicUnwindSections))
-      return make_error<StringError>(
-          Twine("Could not deregister function via ") + RemoveFnName +
-              "error code = " + Twine(Err),
-          inconvertibleErrorCode());
+int UnwindInfoManager::findSectionsImpl(uintptr_t Addr, UnwindSections *Info) {
+  std::lock_guard<std::mutex> Lock(M);
+  auto I = UWSecs.upper_bound(Addr);
+  if (I == UWSecs.begin())
+    return 0;
+  --I;
+  *Info = I->second;
+  return 1;
+}
 
-  FindDynamicUnwindSections = nullptr;
-  return Error::success();
+int UnwindInfoManager::findSections(uintptr_t Addr, UnwindSections *Info) {
+  return Instance->findSectionsImpl(Addr, Info);
 }
 
-Error UnwindInfoManager::registerSections(
+Error UnwindInfoManager::registerSectionsImpl(
     ArrayRef<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase,
     ExecutorAddrRange DWARFEHFrame, ExecutorAddrRange CompactUnwind) {
   std::lock_guard<std::mutex> Lock(M);
@@ -154,7 +145,7 @@ Error UnwindInfoManager::registerSections(
   return Error::success();
 }
 
-Error UnwindInfoManager::deregisterSections(
+Error UnwindInfoManager::deregisterSectionsImpl(
     ArrayRef<ExecutorAddrRange> CodeRanges) {
   std::lock_guard<std::mutex> Lock(M);
   for (auto &R : CodeRanges) {
@@ -169,20 +160,4 @@ Error UnwindInfoManager::deregisterSections(
   return Error::success();
 }
 
-int UnwindInfoManager::findSections(uintptr_t Addr, UnwindSections *Info) {
-  std::lock_guard<std::mutex> Lock(M);
-  auto I = UWSecs.upper_bound(Addr);
-  if (I == UWSecs.begin())
-    return 0;
-  --I;
-  *Info = I->second;
-  return 1;
-}
-
-int UnwindInfoManager::findSectionsHelper(UnwindInfoManager *Instance,
-                                          uintptr_t Addr,
-                                          UnwindSections *Info) {
-  return Instance->findSections(Addr, Info);
-}
-
 } // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
index ae1f3f98269db..4482eedc00702 100644
--- a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
@@ -9,7 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h"
 
 #include "llvm/ADT/ScopeExit.h"
-#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h"
 #include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/IR/IRBuilder.h"
@@ -19,95 +18,21 @@
 
 using namespace llvm::jitlink;
 
-static const char *FindDynamicUnwindSectionsFunctionName =
-    "_orc_rt_alt_find_dynamic_unwind_sections";
-
 namespace llvm::orc {
 
 Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
-UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD,
-                                     ExecutorAddr Instance,
-                                     ExecutorAddr FindHelper,
-                                     ExecutorAddr Enable, ExecutorAddr Disable,
-                                     ExecutorAddr Register,
-                                     ExecutorAddr Deregister) {
-
-  auto &ES = IRL.getExecutionSession();
-
-  // Build bouncer module.
-  auto M = makeBouncerModule(ES);
-  if (!M)
-    return M.takeError();
-
-  auto BouncerRT = PlatformJD.createResourceTracker();
-  auto RemoveBouncerModule = make_scope_exit([&]() {
-    if (auto Err = BouncerRT->remove())
-      ES.reportError(std::move(Err));
-  });
-
-  if (auto Err = PlatformJD.define(absoluteSymbols(
-          {{ES.intern(rt_alt::UnwindInfoManagerInstanceName),
-            ExecutorSymbolDef(Instance, JITSymbolFlags())},
-           {ES.intern(rt_alt::UnwindInfoManagerFindSectionsHelperName),
-            ExecutorSymbolDef(FindHelper, JITSymbolFlags::Callable)}})))
-    return std::move(Err);
-
-  if (auto Err = IRL.add(BouncerRT, std::move(*M)))
-    return Err;
-
-  auto FindUnwindSections =
-      ES.lookup({&PlatformJD}, FindDynamicUnwindSectionsFunctionName);
-  if (!FindUnwindSections)
-    return FindUnwindSections.takeError();
-
-  using namespace shared;
-  using SPSEnableSig = SPSError(SPSExecutorAddr, SPSExecutorAddr);
-  Error CallErr = Error::success();
-  if (auto Err = ES.callSPSWrapper<SPSEnableSig>(
-          Enable, CallErr, Instance, FindUnwindSections->getAddress())) {
-    consumeError(std::move(CallErr));
-    return std::move(Err);
-  }
-
-  if (CallErr)
-    return std::move(CallErr);
-
-  RemoveBouncerModule.release();
-
-  return std::shared_ptr<UnwindInfoRegistrationPlugin>(
-      new UnwindInfoRegistrationPlugin(ES, Instance, Disable, Register,
-                                       Deregister));
-}
-
-Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>>
-UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD) {
+UnwindInfoRegistrationPlugin::Create(ExecutionSession &ES) {
 
-  ExecutorAddr Instance, FindHelper, Enable, Disable, Register, Deregister;
+  ExecutorAddr Register, Deregister;
 
-  auto &EPC = IRL.getExecutionSession().getExecutorProcessControl();
+  auto &EPC = ES.getExecutorProcessControl();
   if (auto Err = EPC.getBootstrapSymbols(
-          {{Instance, rt_alt::UnwindInfoManagerInstanceName},
-           {FindHelper, rt_alt::UnwindInfoManagerFindSectionsHelperName},
-           {Enable, rt_alt::UnwindInfoManagerEnableWrapperName},
-           {Disable, rt_alt::UnwindInfoManagerDisableWrapperName},
-           {Register, rt_alt::UnwindInfoManagerRegisterActionName},
+          {{Register, rt_alt::UnwindInfoManagerRegisterActionName},
            {Deregister, rt_alt::UnwindInfoManagerDeregisterActionName}}))
     return std::move(Err);
 
-  return Create(IRL, PlatformJD, Instance, FindHelper, Enable, Disable,
-                Register, Deregister);
-}
-
-UnwindInfoRegistrationPlugin::~UnwindInfoRegistrationPlugin() {
-  using namespace shared;
-  using SPSDisableSig = SPSError(SPSExecutorAddr);
-  Error CallErr = Error::success();
-  if (auto Err = ES.callSPSWrapper<SPSDisableSig>(Disable, CallErr, Instance)) {
-    consumeError(std::move(CallErr));
-    ES.reportError(std::move(Err));
-  }
-  if (CallErr)
-    ES.reportError(std::move(CallErr));
+  return std::make_shared<UnwindInfoRegistrationPlugin>(ES, Register,
+                                                        Deregister);
 }
 
 void UnwindInfoRegistrationPlugin::modifyPassConfig(
@@ -118,43 +43,6 @@ void UnwindInfoRegistrationPlugin::modifyPassConfig(
       [this](LinkGraph &G) { return addUnwindInfoRegistrationActions(G); });
 }
 
-Expected<ThreadSafeModule>
-UnwindInfoRegistrationPlugin::makeBouncerModule(ExecutionSession &ES) {
-  auto Ctx = std::make_unique<LLVMContext>();
-  auto M = std::make_unique<Module>("__libunwind_find_unwind_bouncer", *Ctx);
-  M->setTargetTriple(ES.getTargetTriple().str());
-
-  auto EscapeName = [](const char *N) { return std::string("\01") + N; };
-
-  auto *PtrTy = PointerType::getUnqual(*Ctx);
-  auto *OpaqueStructTy = StructType::create(*Ctx, "UnwindInfoMgr");
-  auto *UnwindMgrInstance = new GlobalVariable(
-      *M, OpaqueStructTy, true, GlobalValue::ExternalLinkage, nullptr,
-      EscapeName(rt_alt::UnwindInfoManagerInstanceName));
-
-  auto *Int64Ty = Type::getInt64Ty(*Ctx);
-  auto *FindHelperTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy, PtrTy}, false);
-  auto *FindHelperFn = Function::Create(
-      FindHelperTy, GlobalValue::ExternalLinkage,
-      EscapeName(rt_alt::UnwindInfoManagerFindSectionsHelperName), *M);
-
-  auto *FindFnTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy}, false);
-  auto *FindFn =
-      Function::Create(FindFnTy, GlobalValue::ExternalLinkage,
-                       EscapeName(FindDynamicUnwindSectionsFunctionName), *M);
-  auto *EntryBlock = BasicBlock::Create(M->getContext(), StringRef(), FindFn);
-  IRBuilder<> IB(EntryBlock);
-
-  std::vector<Value *> FindHelperArgs;
-  FindHelperArgs.push_back(UnwindMgrInstance);
-  for (auto &Arg : FindFn->args())
-    FindHelperArgs.push_back(&Arg);
-
-  IB.CreateRet(IB.CreateCall(FindHelperFn, FindHelperArgs));
-
-  return ThreadSafeModule(std::move(M), std::move(Ctx));
-}
-
 Error UnwindInfoRegistrationPlugin::addUnwindInfoRegistrationActions(
     LinkGraph &G) {
   ExecutorAddrRange EHFrameRange, UnwindInfoRange;
@@ -220,17 +108,15 @@ Error UnwindInfoRegistrationPlugin::addUnwindInfoRegistrationActions(
 
   using namespace shared;
   using SPSRegisterArgs =
-      SPSArgList<SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>,
-                 SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange>;
-  using SPSDeregisterArgs =
-      SPSArgList<SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>>;
+      SPSArgList<SPSSequence<SPSExecutorAddrRange>, SPSExecutorAddr,
+                 SPSExecutorAddrRange, SPSExecutorAddrRange>;
+  using SPSDeregisterArgs = SPSArgList<SPSSequence<SPSExecutorAddrRange>>;
 
   G.allocActions().push_back(
       {cantFail(WrapperFunctionCall::Create<SPSRegisterArgs>(
-           Register, Instance, CodeRanges, DSOBase, EHFrameRange,
-           UnwindInfoRange)),
-       cantFail(WrapperFunctionCall::Create<SPSDeregisterArgs>(
-           Deregister, Instance, CodeRanges))});
+           Register, CodeRanges, DSOBase, EHFrameRange, UnwindInfoRange)),
+       cantFail(WrapperFunctionCall::Create<SPSDeregisterArgs>(Deregister,
+                                                               CodeRanges))});
 
   return Error::success();
 }
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
index dbdec77327774..8553eb70ebe49 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DynamicLibrary.h"
@@ -190,6 +191,10 @@ int main(int argc, char *argv[]) {
                 SimpleRemoteEPCServer::defaultBootstrapSymbols();
             addDefaultBootstrapValuesForHostProcess(S.bootstrapMap(),
                                                     S.bootstrapSymbols());
+#ifdef __APPLE__
+            if (UnwindInfoManager::TryEnable())
+              UnwindInfoManager::addBootstrapSymbols(S.bootstrapSymbols());
+#endif // __APPLE__
             S.services().push_back(
                 std::make_unique<rt_bootstrap::SimpleExecutorMemoryManager>());
             S.services().push_back(
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index a7ab7554902f8..9e6d3df297fc7 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -44,6 +44,7 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
+#include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -1204,6 +1205,19 @@ Session::Session(std::unique_ptr<ExecutorProcessControl> EPC, Error &Err)
           inconvertibleErrorCode());
       return;
     }
+  } else if (TT.isOSBinFormatMachO()) {
+    if (!NoExec) {
+      std::optional<bool> ForceEHFrames;
+      if ((Err = ES.getBootstrapMapValue<bool, bool>("darwin-use-ehframes-only",
+                                                     ForceEHFrames)))
+        return;
+      bool UseEHFrames = ForceEHFrames ? *ForceEHFrames : false;
+      if (!UseEHFrames)
+        ObjLayer.addPlugin(ExitOnErr(UnwindInfoRegistrationPlugin::Create(ES)));
+      else
+        ObjLayer.addPlugin(std::make_unique<EHFrameRegistrationPlugin>(
+            ES, ExitOnErr(EPCEHFrameRegistrar::Create(ES))));
+    }
   } else if (TT.isOSBinFormatELF()) {
     if (!NoExec)
       ObjLayer.addPlugin(std::make_unique<EHFrameRegistrationPlugin>(

From d135ed0e3166ea306fefcd52d9bf0e2517d06521 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Tue, 11 Feb 2025 15:29:15 -0800
Subject: [PATCH 147/282] [NFC] [clang] fixed unused variable warning

(cherry picked from commit 9f61a60c777465c8a1bb67f80560a9e3b4d0f05b)
---
 .../lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
index d67cbb807f2cd..c0807115c8398 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp
@@ -51,7 +51,7 @@ llvm_orc_rt_alt_UnwindInfoManager_deregister(const char *Data, uint64_t Size) {
 namespace llvm::orc {
 
 static const char *AddFnName = "__unw_add_find_dynamic_unwind_sections";
-static const char *RemoveFnName = "__unw_remove_find_dynamic_unwind_sections";
+[[maybe_unused]] static const char *RemoveFnName = "__unw_remove_find_dynamic_unwind_sections";
 static std::unique_ptr<UnwindInfoManager> Instance;
 static int (*RemoveFindDynamicUnwindSections)(void *) = nullptr;
 

From 73315311bd6a073ba45e938d8728d9b8f389453e Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Fri, 31 Jan 2025 05:02:50 -0500
Subject: [PATCH 148/282] [llvm][Support] Enable dl_iterate_phdr support on
 OpenBSD and DragonFly (#125186)

(cherry picked from commit 95e19e21c55db7ede8ff7795512bbfc4ca0ca782)
---
 llvm/lib/Support/Unix/Signals.inc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 9a12663228a36..2e7b467a14bbe 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -468,7 +468,8 @@ void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
 
 #if ENABLE_BACKTRACES && defined(HAVE_BACKTRACE) &&                            \
     (defined(__linux__) || defined(__FreeBSD__) ||                             \
-     defined(__FreeBSD_kernel__) || defined(__NetBSD__))
+     defined(__FreeBSD_kernel__) || defined(__NetBSD__) ||                     \
+     defined(__OpenBSD__) || defined(__DragonFly__))
 struct DlIteratePhdrData {
   void **StackTrace;
   int depth;

From 8b73dad2247835d4f97dc0a731d3173ce573d597 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Fri, 14 Feb 2025 18:27:54 +0100
Subject: [PATCH 149/282] [libc++][format] Disables the FTM on older MacOS
 versions. (#126547)

On older MacOS versions where `std::to_chars` for floating-point types
is not available the format library can't be used. Due to some issue
with the availability macro used to disable format on MacOS the issue
triggers regardless of the type being formatted.

The print library has the same issue.

Fixes: #125353
(cherry picked from commit fbd92d098500775501ba917f21e094f4d714f562)
---
 libcxx/include/version                        |  8 +-
 .../format.version.compile.pass.cpp           | 48 +++++++----
 .../ostream.version.compile.pass.cpp          | 32 +++++---
 .../print.version.compile.pass.cpp            | 32 +++++---
 .../version.version.compile.pass.cpp          | 80 +++++++++++++------
 .../generate_feature_test_macro_components.py | 10 +++
 6 files changed, 148 insertions(+), 62 deletions(-)

diff --git a/libcxx/include/version b/libcxx/include/version
index 29a71ed574e56..c5966b90c061d 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -417,7 +417,9 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_erase_if                             202002L
 # undef  __cpp_lib_execution
 // # define __cpp_lib_execution                            201902L
-# define __cpp_lib_format                               202110L
+# if _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   define __cpp_lib_format                             202110L
+# endif
 # define __cpp_lib_format_uchar                         202311L
 # define __cpp_lib_generic_unordered_lookup             201811L
 # define __cpp_lib_int_pow2                             202002L
@@ -499,7 +501,9 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # undef  __cpp_lib_optional
 # define __cpp_lib_optional                             202110L
 # define __cpp_lib_out_ptr                              202106L
-# define __cpp_lib_print                                202207L
+# if _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   define __cpp_lib_print                              202207L
+# endif
 # undef  __cpp_lib_ranges
 # define __cpp_lib_ranges                               202406L
 // # define __cpp_lib_ranges_as_const                      202207L
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp
index 7bb2fa399b094..6a96325661346 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp
@@ -68,11 +68,17 @@
 
 #elif TEST_STD_VER == 20
 
-# ifndef __cpp_lib_format
-#   error "__cpp_lib_format should be defined in c++20"
-# endif
-# if __cpp_lib_format != 202110L
-#   error "__cpp_lib_format should have the value 202110L in c++20"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_format
+#     error "__cpp_lib_format should be defined in c++20"
+#   endif
+#   if __cpp_lib_format != 202110L
+#     error "__cpp_lib_format should have the value 202110L in c++20"
+#   endif
+# else
+#   ifdef __cpp_lib_format
+#     error "__cpp_lib_format should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -88,11 +94,17 @@
 
 #elif TEST_STD_VER == 23
 
-# ifndef __cpp_lib_format
-#   error "__cpp_lib_format should be defined in c++23"
-# endif
-# if __cpp_lib_format != 202110L
-#   error "__cpp_lib_format should have the value 202110L in c++23"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_format
+#     error "__cpp_lib_format should be defined in c++23"
+#   endif
+#   if __cpp_lib_format != 202110L
+#     error "__cpp_lib_format should have the value 202110L in c++23"
+#   endif
+# else
+#   ifdef __cpp_lib_format
+#     error "__cpp_lib_format should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # ifndef __cpp_lib_format_ranges
@@ -111,11 +123,17 @@
 
 #elif TEST_STD_VER > 23
 
-# ifndef __cpp_lib_format
-#   error "__cpp_lib_format should be defined in c++26"
-# endif
-# if __cpp_lib_format != 202110L
-#   error "__cpp_lib_format should have the value 202110L in c++26"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_format
+#     error "__cpp_lib_format should be defined in c++26"
+#   endif
+#   if __cpp_lib_format != 202110L
+#     error "__cpp_lib_format should have the value 202110L in c++26"
+#   endif
+# else
+#   ifdef __cpp_lib_format
+#     error "__cpp_lib_format should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # ifndef __cpp_lib_format_ranges
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp
index 2a42ca080612d..61d7747e4b6d1 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp
@@ -89,11 +89,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_print
-#   error "__cpp_lib_print should be defined in c++23"
-# endif
-# if __cpp_lib_print != 202207L
-#   error "__cpp_lib_print should have the value 202207L in c++23"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_print
+#     error "__cpp_lib_print should be defined in c++23"
+#   endif
+#   if __cpp_lib_print != 202207L
+#     error "__cpp_lib_print should have the value 202207L in c++23"
+#   endif
+# else
+#   ifdef __cpp_lib_print
+#     error "__cpp_lib_print should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 #elif TEST_STD_VER > 23
@@ -111,11 +117,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_print
-#   error "__cpp_lib_print should be defined in c++26"
-# endif
-# if __cpp_lib_print != 202207L
-#   error "__cpp_lib_print should have the value 202207L in c++26"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_print
+#     error "__cpp_lib_print should be defined in c++26"
+#   endif
+#   if __cpp_lib_print != 202207L
+#     error "__cpp_lib_print should have the value 202207L in c++26"
+#   endif
+# else
+#   ifdef __cpp_lib_print
+#     error "__cpp_lib_print should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp
index f4ccea4e86304..c9743cf41ef1e 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp
@@ -50,20 +50,32 @@
 
 #elif TEST_STD_VER == 23
 
-# ifndef __cpp_lib_print
-#   error "__cpp_lib_print should be defined in c++23"
-# endif
-# if __cpp_lib_print != 202207L
-#   error "__cpp_lib_print should have the value 202207L in c++23"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_print
+#     error "__cpp_lib_print should be defined in c++23"
+#   endif
+#   if __cpp_lib_print != 202207L
+#     error "__cpp_lib_print should have the value 202207L in c++23"
+#   endif
+# else
+#   ifdef __cpp_lib_print
+#     error "__cpp_lib_print should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 #elif TEST_STD_VER > 23
 
-# ifndef __cpp_lib_print
-#   error "__cpp_lib_print should be defined in c++26"
-# endif
-# if __cpp_lib_print != 202207L
-#   error "__cpp_lib_print should have the value 202207L in c++26"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_print
+#     error "__cpp_lib_print should be defined in c++26"
+#   endif
+#   if __cpp_lib_print != 202207L
+#     error "__cpp_lib_print should have the value 202207L in c++26"
+#   endif
+# else
+#   ifdef __cpp_lib_print
+#     error "__cpp_lib_print should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 8f5788d2bed20..137d6cb428930 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -3685,11 +3685,17 @@
 #   error "__cpp_lib_flat_set should not be defined before c++23"
 # endif
 
-# ifndef __cpp_lib_format
-#   error "__cpp_lib_format should be defined in c++20"
-# endif
-# if __cpp_lib_format != 202110L
-#   error "__cpp_lib_format should have the value 202110L in c++20"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_format
+#     error "__cpp_lib_format should be defined in c++20"
+#   endif
+#   if __cpp_lib_format != 202110L
+#     error "__cpp_lib_format should have the value 202110L in c++20"
+#   endif
+# else
+#   ifdef __cpp_lib_format
+#     error "__cpp_lib_format should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # ifdef __cpp_lib_format_path
@@ -5146,11 +5152,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_format
-#   error "__cpp_lib_format should be defined in c++23"
-# endif
-# if __cpp_lib_format != 202110L
-#   error "__cpp_lib_format should have the value 202110L in c++23"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_format
+#     error "__cpp_lib_format should be defined in c++23"
+#   endif
+#   if __cpp_lib_format != 202110L
+#     error "__cpp_lib_format should have the value 202110L in c++23"
+#   endif
+# else
+#   ifdef __cpp_lib_format
+#     error "__cpp_lib_format should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # ifdef __cpp_lib_format_path
@@ -5679,11 +5691,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_print
-#   error "__cpp_lib_print should be defined in c++23"
-# endif
-# if __cpp_lib_print != 202207L
-#   error "__cpp_lib_print should have the value 202207L in c++23"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_print
+#     error "__cpp_lib_print should be defined in c++23"
+#   endif
+#   if __cpp_lib_print != 202207L
+#     error "__cpp_lib_print should have the value 202207L in c++23"
+#   endif
+# else
+#   ifdef __cpp_lib_print
+#     error "__cpp_lib_print should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # if !defined(_LIBCPP_VERSION) || _LIBCPP_HAS_LOCALIZATION
@@ -6853,11 +6871,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_format
-#   error "__cpp_lib_format should be defined in c++26"
-# endif
-# if __cpp_lib_format != 202110L
-#   error "__cpp_lib_format should have the value 202110L in c++26"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_format
+#     error "__cpp_lib_format should be defined in c++26"
+#   endif
+#   if __cpp_lib_format != 202110L
+#     error "__cpp_lib_format should have the value 202110L in c++26"
+#   endif
+# else
+#   ifdef __cpp_lib_format
+#     error "__cpp_lib_format should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # if !defined(_LIBCPP_VERSION)
@@ -7557,11 +7581,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_print
-#   error "__cpp_lib_print should be defined in c++26"
-# endif
-# if __cpp_lib_print != 202207L
-#   error "__cpp_lib_print should have the value 202207L in c++26"
+# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
+#   ifndef __cpp_lib_print
+#     error "__cpp_lib_print should be defined in c++26"
+#   endif
+#   if __cpp_lib_print != 202207L
+#     error "__cpp_lib_print should have the value 202207L in c++26"
+#   endif
+# else
+#   ifdef __cpp_lib_print
+#     error "__cpp_lib_print should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT' is not met!"
+#   endif
 # endif
 
 # if !defined(_LIBCPP_VERSION) || _LIBCPP_HAS_LOCALIZATION
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 58ecd79cf7469..00318c2d2a3cd 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -529,6 +529,11 @@ def add_version_header(tc):
             # 202305 P2757R3 Type-checking format args
             # 202306 P2637R3 Member Visit
             "headers": ["format"],
+            # Trying to use `std::format` where to_chars floating-point is not
+            # available causes compilation errors, even with non floating-point types.
+            # https://github.com/llvm/llvm-project/issues/125353
+            "test_suite_guard": "!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT",
+            "libcxx_guard": "_LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT",
         },
         {
             "name": "__cpp_lib_format_path",
@@ -1004,6 +1009,11 @@ def add_version_header(tc):
                 # "c++26": 202406, # P3235R3 std::print more types faster with less memory
             },
             "headers": ["ostream", "print"],
+            # Trying to use `std::print` where to_chars floating-point is not
+            # available causes compilation errors, even with non floating-point types.
+            # https://github.com/llvm/llvm-project/issues/125353
+            "test_suite_guard": "!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT",
+            "libcxx_guard": "_LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT",
         },
         {
             "name": "__cpp_lib_quoted_string_io",

From 94291653a7414df9d825d6be9d10be345456d30c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 15 Feb 2025 14:13:32 -0800
Subject: [PATCH 150/282] [PowerPC] Use getSignedTargetConstant in
 SelectOptimalAddrMode. (#127305)

Fixes #127298.

(cherry picked from commit 256145b4b0058ae22a1040cd4b7ea44fc49a4ece)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  4 ++--
 llvm/test/CodeGen/PowerPC/pr127298.ll       | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr127298.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 4ca328bd9a9ba..21ff6f050817a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19050,8 +19050,8 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
         int32_t Addr = (int32_t)CNImm;
         // Otherwise, break this down into LIS + Disp.
         Disp = DAG.getSignedTargetConstant((int16_t)Addr, DL, MVT::i32);
-        Base =
-            DAG.getTargetConstant((Addr - (int16_t)Addr) >> 16, DL, MVT::i32);
+        Base = DAG.getSignedTargetConstant((Addr - (int16_t)Addr) >> 16, DL,
+                                           MVT::i32);
         uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
         Base = SDValue(DAG.getMachineNode(LIS, DL, CNType, Base), 0);
         break;
diff --git a/llvm/test/CodeGen/PowerPC/pr127298.ll b/llvm/test/CodeGen/PowerPC/pr127298.ll
new file mode 100644
index 0000000000000..f7560216ef7d8
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr127298.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=powerpc | FileCheck %s
+
+define void @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    lis 3, -8530
+; CHECK-NEXT:    lbz 3, -16657(3)
+; CHECK-NEXT:    blr
+Entry:
+  %0 = load volatile i8, ptr inttoptr (i32 -559038737 to ptr), align 1
+  ret void
+}

From a2b502050302a4cf8a9c4e623331810eed51bb81 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 30 Jan 2025 19:03:38 -0800
Subject: [PATCH 151/282] [ELF] ICF: replace includeInDynsym with isExported

Similar to the change to MarkLive.cpp when isExported was introduced.
includeInDynsym might return true even when isExported is false for
statically linked executables.

(cherry picked from commit 45f538ecba1a51768002a5bc0c194b5af4cd9c27)
---
 lld/ELF/Driver.cpp      | 2 +-
 lld/test/ELF/icf-safe.s | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 2d8a5ade2fece..6121a4254453c 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2434,7 +2434,7 @@ static void findKeepUniqueSections(Ctx &ctx, opt::InputArgList &args) {
   // or DSOs, so we conservatively mark them as address-significant.
   bool icfSafe = ctx.arg.icf == ICFLevel::Safe;
   for (Symbol *sym : ctx.symtab->getSymbols())
-    if (sym->includeInDynsym(ctx))
+    if (sym->isExported)
       markAddrsig(icfSafe, sym);
 
   // Visit the address-significance table in each object file and mark each
diff --git a/lld/test/ELF/icf-safe.s b/lld/test/ELF/icf-safe.s
index 96776feccbc67..5381532609938 100644
--- a/lld/test/ELF/icf-safe.s
+++ b/lld/test/ELF/icf-safe.s
@@ -1,16 +1,19 @@
 # REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o %ta.o
+# RUN: ld.lld -shared -soname=ta %ta.o -o %ta.so
 
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1.o
 # RUN: llvm-objcopy %t1.o %t1copy.o
 # RUN: llvm-objcopy --localize-symbol=h1 %t1.o %t1changed.o
 # RUN: ld.lld -r %t1.o -o %t1reloc.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %S/Inputs/icf-safe.s -o %t2.o
-# RUN: ld.lld %t1.o %t2.o -o %t2 --icf=safe --print-icf-sections | FileCheck %s
+# RUN: ld.lld %t1.o %t2.o -o %t2 --icf=safe --print-icf-sections --export-dynamic | FileCheck %s
 # RUN: ld.lld %t1copy.o %t2.o -o %t2 --icf=safe --print-icf-sections | FileCheck %s
 # RUN: ld.lld %t1.o %t2.o -o %t3 --icf=safe --print-icf-sections -shared | FileCheck --check-prefix=EXPORT %s
-# RUN: ld.lld %t1.o %t2.o -o %t3 --icf=safe --print-icf-sections --export-dynamic | FileCheck --check-prefix=EXPORT %s
+## Exported symbols are suppressed for ICF when dynamic linking is enabled.
+# RUN: ld.lld %t1.o %t2.o %ta.so -o %t3 --icf=safe --print-icf-sections --export-dynamic | FileCheck --check-prefix=EXPORT %s
 # RUN: ld.lld %t1.o %t2.o -o %t2 --icf=all --print-icf-sections | FileCheck --check-prefix=ALL %s
-# RUN: ld.lld %t1.o %t2.o -o %t2 --icf=all --print-icf-sections --export-dynamic | FileCheck --check-prefix=ALL-EXPORT %s
+# RUN: ld.lld %t1.o %t2.o %ta.so -o %t2 --icf=all --print-icf-sections --export-dynamic | FileCheck --check-prefix=ALL-EXPORT %s
 # RUN: ld.lld %t1changed.o -o %t4 --icf=safe 2>&1 | FileCheck --check-prefix=SH_LINK_0 %s
 # RUN: ld.lld %t1reloc.o -o %t4 --icf=safe 2>&1 | FileCheck --check-prefix=SH_LINK_0 %s
 

From 02a511e42c6783f14ac45d71c5278dd031c1bcf7 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 30 Jan 2025 22:24:04 -0800
Subject: [PATCH 152/282] [ELF] Merge exportDynamic/isExported and remove
 Symbol::includeInDynsym

Commit 3733ed6f1c6b0eef1e13e175ac81ad309fc0b080 introduced isExported to
cache includeInDynsym. If we don't unnecessarily set isExported for
undefined symbols, exportDynamic/includeInDynsym can be replaced with
isExported.

(cherry picked from commit d6fa74ab3d4cc77005836e72a2d6fe222bab4c59)
---
 lld/ELF/Driver.cpp            |  3 ++-
 lld/ELF/InputFiles.cpp        |  4 ++--
 lld/ELF/SymbolTable.cpp       |  8 +++-----
 lld/ELF/Symbols.cpp           | 27 +++++++++++++--------------
 lld/ELF/Symbols.h             | 15 +++++----------
 lld/ELF/SyntheticSections.cpp |  4 ++--
 lld/ELF/Writer.cpp            | 13 +++++++------
 7 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 6121a4254453c..391140bce7394 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2575,7 +2575,8 @@ void LinkerDriver::compileBitcodeFiles(bool skipLinkedOutput) {
       for (Symbol *sym : obj->getGlobalSymbols()) {
         if (!sym->isDefined())
           continue;
-        if (ctx.hasDynsym && sym->includeInDynsym(ctx))
+        if (ctx.hasDynsym && ctx.arg.exportDynamic &&
+            sym->computeBinding(ctx) != STB_LOCAL)
           sym->isExported = true;
         if (sym->hasVersionSuffix)
           sym->parseSymbolVersion(ctx);
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index caee72cf31955..d43de8ce6dfef 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -1581,7 +1581,7 @@ template <class ELFT> void SharedFile::parse() {
       }
       Symbol *s = ctx.symtab->addSymbol(
           Undefined{this, name, sym.getBinding(), sym.st_other, sym.getType()});
-      s->exportDynamic = true;
+      s->isExported = true;
       if (sym.getBinding() != STB_WEAK &&
           ctx.arg.unresolvedSymbolsInShlib != UnresolvedPolicy::Ignore)
         requiredSymbols.push_back(s);
@@ -1778,7 +1778,7 @@ static void createBitcodeSymbol(Ctx &ctx, Symbol *&sym,
                    nullptr);
     // The definition can be omitted if all bitcode definitions satisfy
     // `canBeOmittedFromSymbolTable()` and isUsedInRegularObj is false.
-    // The latter condition is tested in Symbol::includeInDynsym.
+    // The latter condition is tested in parseVersionAndComputeIsPreemptible.
     sym->ltoCanOmit = objSym.canBeOmittedFromSymbolTable() &&
                       (!sym->isDefined() || sym->ltoCanOmit);
     sym->resolve(ctx, newSym);
diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp
index 975700505facb..b8a70d4e898fc 100644
--- a/lld/ELF/SymbolTable.cpp
+++ b/lld/ELF/SymbolTable.cpp
@@ -203,7 +203,7 @@ void SymbolTable::handleDynamicList() {
       syms = findByVersion(ver);
 
     for (Symbol *sym : syms)
-      sym->exportDynamic = sym->inDynamicList = true;
+      sym->isExported = sym->inDynamicList = true;
   }
 }
 
@@ -350,10 +350,8 @@ void SymbolTable::scanVersionScript() {
         assignAsterisk(pat, &v, true);
   }
 
-  // isPreemptible is false at this point. To correctly compute the binding of a
-  // Defined (which is used by includeInDynsym(ctx)), we need to know if it is
-  // VER_NDX_LOCAL or not. Compute symbol versions before handling
-  // --dynamic-list.
+  // Handle --dynamic-list. If a specified symbol is also matched by local: in a
+  // version script, the version script takes precedence.
   handleDynamicList();
 }
 
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index b10391c65dfdc..890877cb1bc04 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -268,16 +268,6 @@ uint8_t Symbol::computeBinding(Ctx &ctx) const {
   return binding;
 }
 
-bool Symbol::includeInDynsym(Ctx &ctx) const {
-  if (computeBinding(ctx) == STB_LOCAL)
-    return false;
-  if (!isDefined() && !isCommon())
-    return true;
-
-  return exportDynamic ||
-         (ctx.arg.exportDynamic && (isUsedInRegularObj || !ltoCanOmit));
-}
-
 // Print out a log message for --trace-symbol.
 void elf::printTraceSymbol(const Symbol &sym, StringRef name) {
   std::string s;
@@ -374,9 +364,18 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (sym->hasVersionSuffix)
       sym->parseSymbolVersion(ctx);
-    if (hasDynsym) {
-      sym->isExported = sym->includeInDynsym(ctx);
-      sym->isPreemptible = sym->isExported && computeIsPreemptible(ctx, *sym);
+    if (!hasDynsym)
+      continue;
+    if (sym->computeBinding(ctx) == STB_LOCAL) {
+      sym->isExported = false;
+      continue;
+    }
+    if (!sym->isDefined() && !sym->isCommon()) {
+      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
+    } else if (ctx.arg.exportDynamic &&
+               (sym->isUsedInRegularObj || !sym->ltoCanOmit)) {
+      sym->isExported = true;
+      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
     }
   }
 }
@@ -655,7 +654,7 @@ void Symbol::resolve(Ctx &ctx, const LazySymbol &other) {
 }
 
 void Symbol::resolve(Ctx &ctx, const SharedSymbol &other) {
-  exportDynamic = true;
+  isExported = true;
   if (isPlaceholder()) {
     other.overwrite(*this);
     return;
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index 48df6f60db864..64f2f6eaa8d09 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -105,6 +105,9 @@ class Symbol {
   uint8_t partition;
 
   // True if this symbol is preemptible at load time.
+  //
+  // Primarily set in two locations, (a) parseVersionAndComputeIsPreemptible and
+  // (b) demoteSymbolsAndComputeIsPreemptible.
   LLVM_PREFERRED_TYPE(bool)
   uint8_t isPreemptible : 1;
 
@@ -131,16 +134,9 @@ class Symbol {
   // - If -shared or --export-dynamic is specified, any symbol in an object
   //   file/bitcode sets this property, unless suppressed by LTO
   //   canBeOmittedFromSymbolTable().
-  //
-  // Primarily set in two locations, (a) after parseSymbolVersion and
-  // (b) during demoteSymbols.
   LLVM_PREFERRED_TYPE(bool)
   uint8_t isExported : 1;
 
-  // Used to compute isExported. Set when defined or referenced by a SharedFile.
-  LLVM_PREFERRED_TYPE(bool)
-  uint8_t exportDynamic : 1;
-
   LLVM_PREFERRED_TYPE(bool)
   uint8_t ltoCanOmit : 1;
 
@@ -159,7 +155,6 @@ class Symbol {
     stOther = (stOther & ~3) | visibility;
   }
 
-  bool includeInDynsym(Ctx &) const;
   uint8_t computeBinding(Ctx &) const;
   bool isGlobal() const { return binding == llvm::ELF::STB_GLOBAL; }
   bool isWeak() const { return binding == llvm::ELF::STB_WEAK; }
@@ -247,8 +242,8 @@ class Symbol {
   Symbol(Kind k, InputFile *file, StringRef name, uint8_t binding,
          uint8_t stOther, uint8_t type)
       : file(file), nameData(name.data()), nameSize(name.size()), type(type),
-        binding(binding), stOther(stOther), symbolKind(k), exportDynamic(false),
-        ltoCanOmit(false), archSpecificBit(false) {}
+        binding(binding), stOther(stOther), symbolKind(k), ltoCanOmit(false),
+        archSpecificBit(false) {}
 
   void overwrite(Symbol &sym, Kind k) const {
     if (sym.traced)
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index eb07d82fc9601..ffa6e3c008c48 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -4776,8 +4776,8 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) {
       add(*part.buildId);
     }
 
-    // dynSymTab is always present to simplify sym->includeInDynsym(ctx) in
-    // finalizeSections.
+    // dynSymTab is always present to simplify several finalizeSections
+    // functions.
     part.dynStrTab = std::make_unique<StringTableSection>(ctx, ".dynstr", true);
     part.dynSymTab =
         std::make_unique<SymbolTableSection<ELFT>>(ctx, *part.dynStrTab);
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 6c7bcee02047b..3ba1cdbce572b 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -296,13 +296,12 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
                   sym->type)
             .overwrite(*sym);
         sym->versionId = VER_NDX_GLOBAL;
-        if (hasDynsym && sym->includeInDynsym(ctx))
-          sym->isExported = true;
       }
     }
 
     if (hasDynsym)
-      sym->isPreemptible = sym->isExported && computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
+                           computeIsPreemptible(ctx, *sym);
   }
 }
 
@@ -1841,9 +1840,10 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
 
   // If the previous code block defines any non-hidden symbols (e.g.
   // __global_pointer$), they may be exported.
-  if (ctx.hasDynsym)
+  if (ctx.hasDynsym && ctx.arg.exportDynamic)
     for (Symbol *sym : ctx.synthesizedSymbols)
-      sym->isExported = sym->includeInDynsym(ctx);
+      if (sym->computeBinding(ctx) != STB_LOCAL)
+        sym->isExported = true;
 
   demoteSymbolsAndComputeIsPreemptible(ctx);
 
@@ -1931,7 +1931,8 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
 
       // computeBinding might localize a linker-synthesized hidden symbol
       // (e.g. __global_pointer$) that was considered exported.
-      if (sym->isExported && !sym->isLocal()) {
+      if (ctx.hasDynsym && (sym->isUndefined() || sym->isExported) &&
+          !sym->isLocal()) {
         ctx.partitions[sym->partition - 1].dynSymTab->addSymbol(sym);
         if (auto *file = dyn_cast<SharedFile>(sym->file))
           if (file->isNeeded && !sym->isUndefined())

From 9bcc825ee491a85c2f7b1573d6a3abf6d5cf0c8a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 5 Feb 2025 21:16:00 -0800
Subject: [PATCH 153/282] [ELF] Refine isExported/isPreemptible condition

Reland 994cea3f0a2d0caf4d66321ad5a06ab330144d89 after bolt tests no
longer rely on -pie --unresolved-symbols=ignore-all with no input DSO
generating PLT entries.

---

Commit f10441ad003236ef3b9e5415a571d2be0c0ce5ce , while dropping a
special case for isUndefWeak and --no-dynamic-linking, made
--export-dynamic ineffective when -pie is used without any input DSO.

This change restores --export-dynamic and unifies -pie and -pie
--no-dynamic-linker when there is no input DSO.

* -pie with no input DSO suppresses undefined symbols in .dynsym.
  Previously this only appied to -pie --no-dynamic-linker.
* As a side effect, -pie with no input DSO suppresses PLT.

(cherry picked from commit 52fc6ffcda0895c0c7b976ad1f5cb5a282b571d2)
---
 lld/ELF/Config.h                              |  2 --
 lld/ELF/Driver.cpp                            | 11 ++---------
 lld/ELF/Symbols.cpp                           |  4 +++-
 lld/ELF/SyntheticSections.cpp                 |  4 ++--
 lld/ELF/Writer.cpp                            | 11 ++++++-----
 lld/test/ELF/executable-undefined-ignoreall.s |  6 ++++--
 lld/test/ELF/ppc32-weak-undef-call.s          |  6 +++---
 lld/test/ELF/ppc64-undefined-weak.s           | 16 ++++++++--------
 lld/test/ELF/riscv-gp.s                       | 13 ++++++++-----
 lld/test/ELF/weak-undef-lib.s                 |  2 +-
 lld/test/ELF/weak-undef-no-dynamic-linker.s   |  7 +++----
 lld/test/ELF/weak-undef-rw.s                  | 16 ++++++++--------
 12 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 98e52b52ea46a..9826ed0517337 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -292,7 +292,6 @@ struct Config {
   bool gdbIndex;
   bool gnuHash = false;
   bool gnuUnique;
-  bool hasDynSymTab;
   bool ignoreDataAddressEquality;
   bool ignoreFunctionAddressEquality;
   bool ltoCSProfileGenerate;
@@ -306,7 +305,6 @@ struct Config {
   bool mipsN32Abi = false;
   bool mmapOutputFile;
   bool nmagic;
-  bool noDynamicLinker = false;
   bool noinhibitExec;
   bool nostdlib;
   bool oFormatBinary;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 391140bce7394..a1e9ecae08557 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -781,11 +781,8 @@ static StringRef getDynamicLinker(Ctx &ctx, opt::InputArgList &args) {
   auto *arg = args.getLastArg(OPT_dynamic_linker, OPT_no_dynamic_linker);
   if (!arg)
     return "";
-  if (arg->getOption().getID() == OPT_no_dynamic_linker) {
-    // --no-dynamic-linker suppresses undefined weak symbols in .dynsym
-    ctx.arg.noDynamicLinker = true;
+  if (arg->getOption().getID() == OPT_no_dynamic_linker)
     return "";
-  }
   return arg->getValue();
 }
 
@@ -2921,12 +2918,8 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
 
   parseFiles(ctx, files);
 
-  // Dynamic linking is used if there is an input DSO,
-  // or -shared or non-static pie is specified.
-  ctx.hasDynsym = !ctx.sharedFiles.empty() || ctx.arg.shared ||
-                  (ctx.arg.pie && !ctx.arg.noDynamicLinker);
   // Create dynamic sections for dynamic linking and static PIE.
-  ctx.arg.hasDynSymTab = ctx.hasDynsym || ctx.arg.isPic;
+  ctx.hasDynsym = !ctx.sharedFiles.empty() || ctx.arg.isPic;
 
   // If an entry symbol is in a static archive, pull out that file now.
   if (Symbol *sym = ctx.symtab->find(ctx.arg.entry))
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index 890877cb1bc04..80b0691428007 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -360,7 +360,9 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
   // Symbol themselves might know their versions because symbols
   // can contain versions in the form of <name>@<version>.
   // Let them parse and update their names to exclude version suffix.
+  // In addition, compute isExported and isPreemptible.
   bool hasDynsym = ctx.hasDynsym;
+  bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (sym->hasVersionSuffix)
       sym->parseSymbolVersion(ctx);
@@ -371,7 +373,7 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
       continue;
     }
     if (!sym->isDefined() && !sym->isCommon()) {
-      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = maybePreemptible && computeIsPreemptible(ctx, *sym);
     } else if (ctx.arg.exportDynamic &&
                (sym->isUsedInRegularObj || !sym->ltoCanOmit)) {
       sym->isExported = true;
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index ffa6e3c008c48..b03c4282ab1aa 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -4740,7 +4740,7 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) {
 
   // Add MIPS-specific sections.
   if (ctx.arg.emachine == EM_MIPS) {
-    if (!ctx.arg.shared && ctx.arg.hasDynSymTab) {
+    if (!ctx.arg.shared && ctx.hasDynsym) {
       ctx.in.mipsRldMap = std::make_unique<MipsRldMapSection>(ctx);
       add(*ctx.in.mipsRldMap);
     }
@@ -4803,7 +4803,7 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) {
       part.relaDyn = std::make_unique<RelocationSection<ELFT>>(
           ctx, relaDynName, ctx.arg.zCombreloc, threadCount);
 
-    if (ctx.arg.hasDynSymTab) {
+    if (ctx.hasDynsym) {
       add(*part.dynSymTab);
 
       part.verSym = std::make_unique<VersionTableSection>(ctx);
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 3ba1cdbce572b..0db922b07aabf 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -284,6 +284,7 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Demote symbols");
   DenseMap<InputFile *, DenseMap<SectionBase *, size_t>> sectionIndexMap;
   bool hasDynsym = ctx.hasDynsym;
+  bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (auto *d = dyn_cast<Defined>(sym)) {
       if (d->section && !d->section->isLive())
@@ -300,7 +301,8 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
     }
 
     if (hasDynsym)
-      sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
+      sym->isPreemptible = maybePreemptible &&
+                           (sym->isUndefined() || sym->isExported) &&
                            computeIsPreemptible(ctx, *sym);
   }
 }
@@ -1929,10 +1931,9 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
       if (ctx.in.symTab)
         ctx.in.symTab->addSymbol(sym);
 
-      // computeBinding might localize a linker-synthesized hidden symbol
-      // (e.g. __global_pointer$) that was considered exported.
-      if (ctx.hasDynsym && (sym->isUndefined() || sym->isExported) &&
-          !sym->isLocal()) {
+      // computeBinding might localize a symbol that was considered exported
+      // but then synthesized as hidden (e.g. _DYNAMIC).
+      if ((sym->isExported || sym->isPreemptible) && !sym->isLocal()) {
         ctx.partitions[sym->partition - 1].dynSymTab->addSymbol(sym);
         if (auto *file = dyn_cast<SharedFile>(sym->file))
           if (file->isNeeded && !sym->isUndefined())
diff --git a/lld/test/ELF/executable-undefined-ignoreall.s b/lld/test/ELF/executable-undefined-ignoreall.s
index 073b22bd84543..1f83b1b61830a 100644
--- a/lld/test/ELF/executable-undefined-ignoreall.s
+++ b/lld/test/ELF/executable-undefined-ignoreall.s
@@ -1,11 +1,13 @@
 # REQUIRES: x86
 
-## --unresolved-symbols=ignore-all behaves similar to -shared:
+## In dynamic linking, --unresolved-symbols=ignore-all behaves similar to -shared:
 ## for PLT relocations to undefined symbols, produce dynamic relocations if we
 ## emit .dynsym.
 
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o %ta.o
+# RUN: ld.lld -shared -soname=ta %ta.o -o %ta.so
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
-# RUN: ld.lld %t.o -o %t --unresolved-symbols=ignore-all -pie
+# RUN: ld.lld %t.o %ta.so -o %t --unresolved-symbols=ignore-all -pie
 # RUN: llvm-readobj -r %t | FileCheck %s
 
 # CHECK:      Relocations [
diff --git a/lld/test/ELF/ppc32-weak-undef-call.s b/lld/test/ELF/ppc32-weak-undef-call.s
index dfb45e5fe18cf..1ad280a49c038 100644
--- a/lld/test/ELF/ppc32-weak-undef-call.s
+++ b/lld/test/ELF/ppc32-weak-undef-call.s
@@ -1,15 +1,15 @@
 # REQUIRES: ppc
 # RUN: llvm-mc -filetype=obj -triple=powerpc %s -o %t.o
 # RUN: ld.lld %t.o -o %t
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=PDE %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=STATIC %s
 # RUN: ld.lld -pie %t.o -o %t
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=PIC %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=STATIC %s
 # RUN: ld.lld -shared %t.o -o %t
 # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=PIC %s
 
 ## It does not really matter how we fixup it, but we cannot overflow and
 ## should not generate a call stub (this would waste space).
-# PDE: bl 0x100100b4
+# STATIC: bl {{.*}} <.text>
 
 ## With -pie or -shared, create a call stub. ld.bfd produces bl .+0
 # PIC:       bl 0x[[PLT:[0-9a-f]+]]
diff --git a/lld/test/ELF/ppc64-undefined-weak.s b/lld/test/ELF/ppc64-undefined-weak.s
index 7b1be5e36dd32..e9168020b216f 100644
--- a/lld/test/ELF/ppc64-undefined-weak.s
+++ b/lld/test/ELF/ppc64-undefined-weak.s
@@ -2,25 +2,25 @@
 
 # RUN: llvm-mc -filetype=obj -triple=powerpc64le %s -o %t.o
 # RUN: ld.lld %t.o -o %t
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefix=PDE
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefix=STATIC
 # RUN: ld.lld -pie %t.o -o %t
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefix=PIC
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefix=STATIC
 # RUN: ld.lld -shared %t.o -o %t.so
 # RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck %s --check-prefix=PIC
 
 # RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t.o
 # RUN: ld.lld %t.o -o %t
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefix=PDE
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefix=STATIC
 
 ## Branches to an undefined weak symbol need a thunk iff a dynamic relocation is
 ## produced. undefweak2 is hidden and does not need a dynamic relocation, so we
 ## suppress the thunk. undefweak1 needs a thunk iff -pie or -shared.
 
-# PDE-LABEL: <_start>:
-# PDE-NEXT:    bl {{.*}} <_start>
-# PDE-NEXT:    nop
-# PDE-NEXT:    bl {{.*}} <_start+0x8>
-# PDE-NEXT:    nop
+# STATIC-LABEL: <_start>:
+# STATIC-NEXT:    bl {{.*}} <_start>
+# STATIC-NEXT:    nop
+# STATIC-NEXT:    bl {{.*}} <_start+0x8>
+# STATIC-NEXT:    nop
 
 # PIC-LABEL: <_start>:
 # PIC-NEXT:    bl {{.*}} <__plt_undefweak1>
diff --git a/lld/test/ELF/riscv-gp.s b/lld/test/ELF/riscv-gp.s
index a30f5e9fbc625..2f715e1470f2d 100644
--- a/lld/test/ELF/riscv-gp.s
+++ b/lld/test/ELF/riscv-gp.s
@@ -14,17 +14,20 @@
 # SEC32: {{0*}}000039c0 0 NOTYPE GLOBAL DEFAULT [[#SDATA]] __global_pointer$
 
 # SEC64: [ [[#SDATA:]]] .sdata PROGBITS {{0*}}000032e0
+# SEC64:     '.dynsym'
+# SEC64-NOT: __global_pointer$
+# SEC64:     '.symtab'
 # SEC64: {{0*}}00003ae0 0 NOTYPE GLOBAL DEFAULT [[#SDATA]] __global_pointer$
 
 # ERR: error: relocation R_RISCV_PCREL_HI20 cannot be used against symbol '__global_pointer$'; recompile with -fPIC
 
 # RUN: ld.lld -pie --no-dynamic-linker --export-dynamic %t.64.o -o %t.64e
-# RUN: llvm-readelf -s %t.64e | FileCheck %s --check-prefix=STATICPIE
+# RUN: llvm-readelf -s %t.64e | FileCheck %s --check-prefix=STATICE
 
-# STATICPIE:     '.dynsym'
-# STATICPIE-NOT: __global_pointer$
-# STATICPIE:     '.symtab'
-# STATICPIE:     __global_pointer$
+# STATICE:     '.dynsym'
+# STATICE:     __global_pointer$
+# STATICE:     '.symtab'
+# STATICE:     __global_pointer$
 
 ## -r mode does not define __global_pointer$.
 # RUN: ld.lld -r %t.64.o -o %t.64.ro
diff --git a/lld/test/ELF/weak-undef-lib.s b/lld/test/ELF/weak-undef-lib.s
index a554e1d5a2f89..0ff1bc755f075 100644
--- a/lld/test/ELF/weak-undef-lib.s
+++ b/lld/test/ELF/weak-undef-lib.s
@@ -7,7 +7,7 @@
 # RUN: llvm-readobj --dyn-syms %t.so | FileCheck %s
 
 # RUN: ld.lld -pie -o %t %t1.o --start-lib %t2.o
-# RUN: llvm-readobj --dyn-syms %t | FileCheck %s
+# RUN: llvm-readelf --dyn-syms %t | FileCheck %s --check-prefix=STATICPIE
 
 # CHECK:      Name: foo
 # CHECK-NEXT: Value: 0x0
diff --git a/lld/test/ELF/weak-undef-no-dynamic-linker.s b/lld/test/ELF/weak-undef-no-dynamic-linker.s
index fa6936e1ef393..11abd5351af9d 100644
--- a/lld/test/ELF/weak-undef-no-dynamic-linker.s
+++ b/lld/test/ELF/weak-undef-no-dynamic-linker.s
@@ -1,13 +1,12 @@
 # REQUIRES: x86
 # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
 # RUN: ld.lld -pie %t.o -o %t
-# RUN: llvm-readobj --dyn-syms %t | FileCheck %s
+# RUN: llvm-readobj --dyn-syms %t | FileCheck --check-prefix=NO %s
 # RUN: ld.lld -pie --no-dynamic-linker %t.o -o %t
 # RUN: llvm-readobj --dyn-syms %t | FileCheck --check-prefix=NO %s
 
-## With --no-dynamic-linker, don't emit undefined weak symbols to .dynsym .
-## This will suppress a relocation.
-# CHECK: Name: foo
+## With static PIE (whether or not --no-dynamic-linker is specified), don't
+## emit undefined weak symbols to .dynsym . This suppresses relocations.
 # NO-NOT: Name: foo
 
 .weak foo
diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index ec3e4ce734895..c5cf1bdcaba68 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -1,23 +1,23 @@
 # REQUIRES: x86
 # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
 # RUN: ld.lld %t.o -o %t --export-dynamic
-# RUN: llvm-readelf -r --hex-dump=.data %t | FileCheck %s --check-prefix=NOPIC
+# RUN: llvm-readelf -r --hex-dump=.data %t | FileCheck %s --check-prefix=STATIC
 # RUN: ld.lld %t.o -o %t.pie -pie
-# RUN: llvm-readobj -r %t.pie | FileCheck %s --check-prefix=PIC
+# RUN: llvm-readelf -r --hex-dump=.data %t.pie | FileCheck %s --check-prefix=STATIC
 # RUN: ld.lld %t.o -o %t.so -shared
 # RUN: llvm-readobj -r %t.so | FileCheck %s --check-prefix=PIC
 
 ## gABI leaves the behavior of weak undefined references implementation defined.
-## We choose to resolve them statically for -no-pie and produce dynamic relocations
-## for -pie and -shared.
+## We choose to resolve them statically for static linking and produce dynamic relocations
+## for dynamic linking (-shared or at least one input DSO).
 ##
 ## Note: Some ports of GNU ld support -z nodynamic-undefined-weak that we don't
 ## implement.
 
-# NOPIC: no relocations
-# NOPIC: Hex dump of section '.data':
-# NOPIC-NEXT: {{.*}} 00000000 00000000 
-# NOPIC-EMPTY:
+# STATIC:      no relocations
+# STATIC:      Hex dump of section '.data':
+# STATIC-NEXT: {{.*}} 00000000 00000000 .
+# STATIC-EMPTY:
 
 # PIC:      .rela.dyn {
 # PIC-NEXT:   R_X86_64_64 foobar 0x0

From 9e02cc4080f2268845c7e51a3f1a3b150daad40c Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 16 Feb 2025 20:18:29 +0800
Subject: [PATCH 154/282] [InstCombine] Do not keep samesign when speculatively
 executing icmps (#127007)

Closes https://github.com/llvm/llvm-project/issues/126974.

(cherry picked from commit 29f3a352068ce562bcb65e18a676c82a9991583c)
---
 .../InstCombine/InstCombineCompares.cpp       |  5 ++++
 llvm/test/Transforms/InstCombine/umax-icmp.ll | 24 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index b64ac20ab0533..810ce7d382ae1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5609,6 +5609,11 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I,
       return false;
     return std::nullopt;
   };
+  // Remove samesign here since it is illegal to keep it when we speculatively
+  // execute comparisons. For example, `icmp samesign ult umax(X, -46), -32`
+  // cannot be decomposed into `(icmp samesign ult X, -46) or (icmp samesign ult
+  // -46, -32)`. `X` is allowed to be non-negative here.
+  Pred = static_cast<CmpInst::Predicate>(Pred);
   auto CmpXZ = IsCondKnownTrue(simplifyICmpInst(Pred, X, Z, Q));
   auto CmpYZ = IsCondKnownTrue(simplifyICmpInst(Pred, Y, Z, Q));
   if (!CmpXZ.has_value() && !CmpYZ.has_value())
diff --git a/llvm/test/Transforms/InstCombine/umax-icmp.ll b/llvm/test/Transforms/InstCombine/umax-icmp.ll
index b4eea30bfc6af..0c42d26750e4b 100644
--- a/llvm/test/Transforms/InstCombine/umax-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/umax-icmp.ll
@@ -804,4 +804,28 @@ end:
   ret void
 }
 
+define i1 @pr126974(i8 %x) {
+; CHECK-LABEL: @pr126974(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i8 [[X:%.*]], -2
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[X]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cond = icmp sgt i8 %x, -2
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %umax = call i8 @llvm.umax.i8(i8 %x, i8 -46)
+  %cmp = icmp samesign ult i8 %umax, -32
+  ret i1 %cmp
+
+if.else:
+  ret i1 false
+}
+
 declare i32 @llvm.umax.i32(i32, i32)

From 5c126253ca4ecf8d46a313856c04a17e5d65a800 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Tue, 18 Feb 2025 01:04:28 +0000
Subject: [PATCH 155/282] [ReleaseNotes][RemoveDIs] Add release note for
 deprecated insertion methods (#127493)

I've stuck this under "LLVM Infrastructure" as the IR plumbing methods
feel like infrastructure. The LLVM17 release notes stuck similar notes
in that section too.
---
 llvm/docs/ReleaseNotes.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index b42e111dc4283..c80aecfdea084 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -116,6 +116,8 @@ Changes to the LLVM IR
 Changes to LLVM infrastructure
 ------------------------------
 
+ * Two methods that use Instruction pointers as code positions (moveBefore, getFirstNonPHI) have been deprecated in favour of overloads and variants that use `BasicBlock::iterator`s instead. The pointer-flavoured methods will be removed in a future release. This work is part of the [RemoveDIs](https://llvm.org/docs/RemoveDIsDebugInfo.html) project, the documentation for which contains instructions for updating call-sites using the deprecated methods.
+
 Changes to building LLVM
 ------------------------
 

From 5707853ff790e724f2bb2c470f5a943743ac4511 Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Mon, 17 Feb 2025 17:46:02 -0500
Subject: [PATCH 156/282] [OpenMP][libomp] Add OpenBSD, NetBSD and DragonFly
 stdarg handling (#126182)

Fixes build on OpenBSD/aarch64.

```
FAILED: openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o
/home/brad/tmp/llvm-build/bin/clang++ --target=aarch64-unknown-openbsd7.6 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -Domp_EXPORTS -I/home/brad/tmp/llvm-build/runtimes/runtimes-bins/openmp/runtime/src -I/home/brad/tmp/llvm-brad/openmp/runtime/src -I/home/brad/tmp/llvm-brad/openmp/runtime/src/i18n -I/home/brad/tmp/llvm-brad/openmp/runtime/src/include -I/home/brad/tmp/llvm-brad/openmp/runtime/src/thirdparty/ittnotify -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Wall -fcolor-diagnostics -Wcast-qual -Wformat-pedantic -Wimplicit-fallthrough -Wsign-compare -Wno-extra -Wno-pedantic -fno-semantic-interposition -fdata-sections -O3 -DNDEBUG -std=c++17 -fPIC   -D _GNU_SOURCE -D _REENTRANT -U_GLIBCXX_ASSERTIONS -UNDEBUG -fno-exceptions -fno-rtti -Wno-covered-switch-default -Wno-frame-address -Wno-strict-aliasing -Wno-switch -Wno-uninitialized -Wno-return-type-c-linkage -Wno-cast-qual -Wno-int-to-void-pointer-cast -MD -MT openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o -MF openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o.d -o openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o -c /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1449:47: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool'
 1449 |   return (master_th->th.th_teams_microtask && ap &&
      |                                               ^~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1449:44: error: invalid operands to binary expression ('microtask_t' (aka 'void (*)(int *, int *, ...)') and 'kmp_va_list' (aka '__builtin_va_list'))
 1449 |   return (master_th->th.th_teams_microtask && ap &&
      |           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^  ~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1457:15: warning: comparison between NULL and non-pointer ('kmp_va_list' (aka '__builtin_va_list') and NULL) [-Wnull-arithmetic]
 1457 |   return ((ap == NULL && active_level == 0) ||
      |            ~~ ^  ~~~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1457:15: error: invalid operands to binary expression ('kmp_va_list' (aka '__builtin_va_list') and 'long')
 1457 |   return ((ap == NULL && active_level == 0) ||
      |            ~~ ^  ~~~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1458:12: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool'
 1458 |           (ap && teams_level > 0 && teams_level == level));
      |            ^~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1458:15: error: invalid operands to binary expression ('kmp_va_list' (aka '__builtin_va_list') and 'bool')
 1458 |           (ap && teams_level > 0 && teams_level == level));
      |            ~~ ^  ~~~~~~~~~~~~~~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1735:9: error: invalid argument type 'kmp_va_list' (aka '__builtin_va_list') to unary expression
 1735 |     if (!ap) {
      |         ^~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2169:66: warning: comparison between NULL and non-pointer ('kmp_va_list' (aka '__builtin_va_list') and NULL) [-Wnull-arithmetic]
 2169 |             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
      |                                                               ~~ ^  ~~~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2169:66: error: invalid operands to binary expression ('kmp_va_list' (aka '__builtin_va_list') and 'long')
 2169 |             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
      |                                                               ~~ ^  ~~~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2284:9: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool'
 2284 |     if (ap) {
      |         ^~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2302:58: error: invalid argument type 'kmp_va_list' (aka '__builtin_va_list') to unary expression
 2302 |     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
      |                                                          ^~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2363:9: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool'
 2363 |     if (ap) {
      |         ^~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:7803:3: error: no matching function for call to '__kmp_fork_call'
 7803 |   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
      |   ^~~~~~~~~~~~~~~
/home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1927:5: note: candidate function not viable: no known conversion from 'long' to 'kmp_va_list' (aka '__builtin_va_list') for 7th argument
 1927 | int __kmp_fork_call(ident_t *loc, int gtid,
      |     ^
 1928 |                     enum fork_context_e call_context, // Intel, GNU, ...
 1929 |                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
 1930 |                     kmp_va_list ap) {
      |                     ~~~~~~~~~~~~~~
2 warnings and 11 errors generated.
```

(cherry picked from commit 0b8bd472b0faf79005dfdd1078904fdf39879d61)
---
 openmp/runtime/src/kmp_os.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index 2252f5e7e97a7..29a281f096855 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -219,7 +219,8 @@ typedef kmp_uint32 kmp_uint;
 
 // stdarg handling
 #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_WASM) &&  \
-    (KMP_OS_FREEBSD || KMP_OS_LINUX || KMP_OS_WASI)
+    (KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || KMP_OS_DRAGONFLY ||  \
+     KMP_OS_LINUX || KMP_OS_WASI)
 typedef va_list *kmp_va_list;
 #define kmp_va_deref(ap) (*(ap))
 #define kmp_va_addr_of(ap) (&(ap))

From 7643bd660236cd72345c0f3cbbdc75e2726ff32b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 31 Jan 2025 11:38:14 +0000
Subject: [PATCH 157/282] [TBAA] Don't emit pointer-tbaa for void pointers.
 (#122116)

While there are no special rules in the standards regarding void
pointers and strict aliasing, emitting distinct tags for void pointers
break some common idioms and there is no good alternative to re-write
the code without strict-aliasing violations. An example is to count the
entries in an array of pointers:

    int count_elements(void * values) {
      void **seq = values;
      int count;
      for (count = 0; seq && seq[count]; count++);
      return count;
    }

https://clang.godbolt.org/z/8dTv51v8W

An example in the wild is from
https://github.com/llvm/llvm-project/issues/119099

This patch avoids emitting distinct tags for void pointers, to avoid
those idioms causing mis-compiles for now.

Fixes https://github.com/llvm/llvm-project/issues/119099.
Fixes https://github.com/llvm/llvm-project/issues/122537.

PR: https://github.com/llvm/llvm-project/pull/122116
(cherry picked from commit 77d3f8a92564b533a3c60a8c8e0657c38fd88ba1)
---
 clang/docs/UsersManual.rst                    | 88 +++++++++++++++++--
 clang/lib/CodeGen/CodeGenTBAA.cpp             |  8 ++
 clang/test/CodeGen/tbaa-pointers.c            | 13 +--
 .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl    | 25 +++---
 clang/unittests/CodeGen/TBAAMetadataTest.cpp  | 12 +--
 5 files changed, 105 insertions(+), 41 deletions(-)

diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index a56c9425ebb75..943a9218ccbc2 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2489,6 +2489,82 @@ are listed below.
 
     $ clang -fuse-ld=lld -Oz -Wl,--icf=safe -fcodegen-data-use code.cc
 
+.. _strict_aliasing:
+
+Strict Aliasing
+---------------
+
+The C and C++ standards require accesses to objects in memory to use l-values of
+an appropriate type for the object. This is called *strict aliasing* or
+*type-based alias analysis*. Strict aliasing enhances a variety of powerful
+memory optimizations, including reordering, combining, and eliminating memory
+accesses. These optimizations can lead to unexpected behavior in code that
+violates the strict aliasing rules. For example:
+
+.. code-block:: c++
+
+    void advance(size_t *index, double *data) {
+      double value = data[*index];
+      /* Clang may assume that this store does not change the contents of `data`. */
+      *index += 1;
+      /* Clang may assume that this store does not change the contents of `index`. */
+      data[*index] = value;
+      /* Either of these facts may create significant optimization opportunities
+       if Clang is able to inline this function. */
+  }
+
+Strict aliasing can be explicitly enabled with ``-fstrict-aliasing`` and
+disabled with ``-fno-strict-aliasing``. ``clang-cl`` defaults to
+``-fno-strict-aliasing``; see . Otherwise, Clang defaults to ``-fstrict-aliasing``.
+
+C and C++ specify slightly different rules for strict aliasing. To improve
+language interoperability, Clang allows two types to alias if either language
+would permit it. This includes applying the C++ similar types rule to C,
+allowing ``int **`` to alias ``int const * const *``. Clang also relaxes the
+standard aliasing rules in the following ways:
+
+* All integer types of the same size are permitted to alias each other,
+  including signed and unsigned types.
+* ``void*`` is permitted to alias any pointer type, ``void**`` is permitted to
+  alias any pointer to pointer type, and so on.
+
+Code which violates strict aliasing has undefined behavior. A program that
+works in one version of Clang may not work in another because of changes to the
+optimizer. Clang provides a :doc:`TypeSanitizer` to help detect
+violations of the strict aliasing rules, but it is currently still experimental.
+Code that is known to violate strict aliasing should generally be built with
+``-fno-strict-aliasing`` if the violation cannot be fixed.
+
+Clang supports several ways to fix a violation of strict aliasing:
+
+* L-values of the character types ``char`` and ``unsigned char`` (as well as
+  other types, depending on the standard) are permitted to access objects of
+  any type.
+
+* Library functions such as ``memcpy`` and ``memset`` are specified as treating
+  memory as characters and therefore are not limited by strict aliasing. If a
+  value of one type must be reinterpreted as another (e.g. to read the bits of a
+  floating-point number), use ``memcpy`` to copy the representation to an object
+  of the destination type. This has no overhead over a direct l-value access
+  because Clang should reliably optimize calls to these functions to use simple
+  loads and stores when they are used with small constant sizes.
+
+* The attribute ``may_alias`` can be added to a ``typedef`` to give l-values of
+  that type the same aliasing power as the character types.
+
+Clang makes a best effort to avoid obvious miscompilations from strict aliasing
+by only considering type information when it cannot prove that two accesses must
+refer to the same memory. However, it is not recommended that programmers
+intentionally rely on this instead of using one of the solutions above because
+it is too easy for the compiler's analysis to be blocked in surprising ways.
+
+In Clang 20, Clang strengthened its implementation of strict aliasing for
+accesses of pointer type. Previously, all accesses of pointer type were
+permitted to alias each other, but Clang now distinguishes different pointers
+by their pointee type, except as limited by the relaxations around qualifiers
+and ``void*`` described above. The previous behavior of treating all pointers as
+aliasing can be restored using ``-fno-pointer-tbaa``.
+
 Profile Guided Optimization
 ---------------------------
 
@@ -5272,12 +5348,6 @@ The Visual C++ Toolset has a slightly more elaborate mechanism for detection.
 Restrictions and Limitations compared to Clang
 ----------------------------------------------
 
-Strict Aliasing
-^^^^^^^^^^^^^^^
-
-Strict aliasing (TBAA) is always off by default in clang-cl. Whereas in clang,
-strict aliasing is turned on by default for all optimization levels.
-
-To enable LLVM optimizations based on strict aliasing rules (e.g., optimizations
-based on type of expressions in C/C++), user will need to explicitly pass
-`-fstrict-aliasing` to clang-cl.
+Strict aliasing (TBAA) is always off by default in clang-cl whereas in clang,
+strict aliasing is turned on by default for all optimization levels. For more
+details, see :ref:`Strict aliasing <strict_aliasing>`.
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 75e66bae79afd..3f1a24791ddd8 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -226,6 +226,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
       PtrDepth++;
       Ty = Ty->getPointeeType()->getBaseElementTypeUnsafe();
     } while (Ty->isPointerType());
+
+    // While there are no special rules in the standards regarding void pointers
+    // and strict aliasing, emitting distinct tags for void pointers break some
+    // common idioms and there is no good alternative to re-write the code
+    // without strict-aliasing violations.
+    if (Ty->isVoidType())
+      return AnyPtr;
+
     assert(!isa<VariableArrayType>(Ty));
     // When the underlying type is a builtin type, we compute the pointee type
     // string recursively, which is implicitly more forgiving than the standards
diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c
index 4aae2552f107a..48adac503357f 100644
--- a/clang/test/CodeGen/tbaa-pointers.c
+++ b/clang/test/CodeGen/tbaa-pointers.c
@@ -208,12 +208,9 @@ int void_ptrs(void **ptr) {
 // COMMON-LABEL: define i32 @void_ptrs(
 // COMMON-SAME: ptr noundef [[PTRA:%.+]])
 // COMMON:        [[PTR_ADDR:%.+]]  = alloca ptr, align 8
-// DISABLE-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
-// DISABLE-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
-// DISABLE-NEXT:  [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[ANYPTR]]
-// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2VOID:!.+]]
-// DEFAULT-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[P2VOID]]
-// DEFAULT-NEXT:  [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[P1VOID:!.+]]
+// COMMON-NEXT:   store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+// COMMON-NEXT:   [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
+// COMMON-NEXT:   [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[ANYPTR]]
 // COMMON-NEXT:   [[BOOL:%.+]] = icmp ne ptr [[L1]], null
 // COMMON-NEXT:   [[BOOL_EXT:%.+]] = zext i1 [[BOOL]] to i64
 // COMMON-NEXT:   [[COND:%.+]] = select i1 [[BOOL]], i32 0, i32 1
@@ -254,7 +251,3 @@ int void_ptrs(void **ptr) {
 // COMMON:  [[INT_TAG]] = !{[[INT_TY:!.+]], [[INT_TY]], i64 0}
 // COMMON:  [[INT_TY]] = !{!"int", [[CHAR]], i64 0}
 // DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER]],  [[ANY_POINTER]], i64 0}
-// DEFAULT: [[P2VOID]] = !{[[P2VOID_TY:!.+]], [[P2VOID_TY]], i64 0}
-// DEFAULT: [[P2VOID_TY]] = !{!"p2 void", [[ANY_POINTER]], i64 0}
-// DEFAULT: [[P1VOID]] = !{[[P1VOID_TY:!.+]], [[P1VOID_TY]], i64 0}
-// DEFAULT: [[P1VOID_TY]] = !{!"p1 void", [[ANY_POINTER]], i64 0}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 5599f4dd50f04..ace34dd0ca6dc 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -651,7 +651,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // GFX900: Function Attrs: convergent nounwind
 // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
-// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META28:![0-9]+]] !kernel_arg_access_qual [[META29:![0-9]+]] !kernel_arg_type [[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META31:![0-9]+]] {
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META27:![0-9]+]] !kernel_arg_access_qual [[META28:![0-9]+]] !kernel_arg_type [[META29:![0-9]+]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META30:![0-9]+]] {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
 // GFX900-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
@@ -688,7 +688,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // GFX900: Function Attrs: convergent norecurse nounwind
 // GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel
-// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META32:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META33:![0-9]+]] !kernel_arg_base_type [[META33]] !kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META31:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META32:![0-9]+]] !kernel_arg_base_type [[META32]] !kernel_arg_type_qual [[META25]] {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // GFX900-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -700,7 +700,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr
 // GFX900-NEXT:    [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr
 // GFX900-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA34:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA33:![0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
 // GFX900-NEXT:    store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
@@ -803,16 +803,15 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900: [[META23]] = !{!"none"}
 // GFX900: [[META24]] = !{!"__block_literal"}
 // GFX900: [[META25]] = !{!""}
-// GFX900: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
-// GFX900: [[META27]] = !{!"p1 void", [[META9]], i64 0}
-// GFX900: [[META28]] = !{i32 0, i32 3}
-// GFX900: [[META29]] = !{!"none", !"none"}
-// GFX900: [[META30]] = !{!"__block_literal", !"void*"}
-// GFX900: [[META31]] = !{!"", !""}
-// GFX900: [[META32]] = !{i32 1}
-// GFX900: [[META33]] = !{!"int*"}
-// GFX900: [[TBAA34]] = !{[[META35:![0-9]+]], [[META35]], i64 0}
-// GFX900: [[META35]] = !{!"p1 int", [[META9]], i64 0}
+// GFX900: [[TBAA26]] = !{[[META9]], [[META9]], i64 0}
+// GFX900: [[META27]] = !{i32 0, i32 3}
+// GFX900: [[META28]] = !{!"none", !"none"}
+// GFX900: [[META29]] = !{!"__block_literal", !"void*"}
+// GFX900: [[META30]] = !{!"", !""}
+// GFX900: [[META31]] = !{i32 1}
+// GFX900: [[META32]] = !{!"int*"}
+// GFX900: [[TBAA33]] = !{[[META34:![0-9]+]], [[META34]], i64 0}
+// GFX900: [[META34]] = !{!"p1 int", [[META9]], i64 0}
 //.
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 // CHECK: {{.*}}
diff --git a/clang/unittests/CodeGen/TBAAMetadataTest.cpp b/clang/unittests/CodeGen/TBAAMetadataTest.cpp
index cad8783ea73fb..f05c9787c63ea 100644
--- a/clang/unittests/CodeGen/TBAAMetadataTest.cpp
+++ b/clang/unittests/CodeGen/TBAAMetadataTest.cpp
@@ -117,15 +117,9 @@ TEST(TBAAMetadataTest, BasicTypes) {
   ASSERT_TRUE(I);
 
   I = matchNext(I,
-      MInstruction(Instruction::Store,
-        MValType(PointerType::getUnqual(Compiler.Context)),
-        MMTuple(
-          MMTuple(
-            MMString("p1 void"),
-            AnyPtr,
-            MConstInt(0)),
-          MSameAs(0),
-          MConstInt(0))));
+                MInstruction(Instruction::Store,
+                             MValType(PointerType::getUnqual(Compiler.Context)),
+                             MMTuple(AnyPtr, MSameAs(0), MConstInt(0))));
   ASSERT_TRUE(I);
 
   I = matchNext(I,

From 99eab31a80d93eacea2e8687e12cf34a381fa68c Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Wed, 29 Jan 2025 21:58:55 -0300
Subject: [PATCH 158/282] [clang] StmtPrinter: Handle DeclRefExpr to a
 Decomposition (#125001)

A DeclRefExpr could never refer to a Decomposition in valid C++ code,
but somehow the Analyzer creates these entities and then it tries to
print them.

There is no sensible answer here, so we print 'decomposition' followed
by the names of all of its bindings, separated by dashes.

(cherry picked from commit 00c096e604ad3a5244af06602556f8de867e36c4)
---
 clang/lib/AST/StmtPrinter.cpp               |  8 +-
 clang/test/Analysis/anonymous-decls.cpp     | 89 +++++++++++++++++++++
 clang/test/Analysis/anonymous-parameter.cpp | 30 -------
 3 files changed, 96 insertions(+), 31 deletions(-)
 create mode 100644 clang/test/Analysis/anonymous-decls.cpp
 delete mode 100644 clang/test/Analysis/anonymous-parameter.cpp

diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 9efc88436f928..9299d4cb4aba8 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1291,8 +1291,14 @@ void StmtPrinter::VisitDeclRefExpr(DeclRefExpr *Node) {
          << PD->getFunctionScopeIndex();
       break;
     }
+    case Decl::Decomposition:
+      OS << "decomposition";
+      for (const auto &I : cast<DecompositionDecl>(VD)->bindings())
+        OS << '-' << I->getName();
+      break;
     default:
-      llvm_unreachable("Unhandled anonymous declaration kind");
+      OS << "unhandled-anonymous-" << VD->getDeclKindName();
+      break;
     }
   }
   if (Node->hasExplicitTemplateArgs()) {
diff --git a/clang/test/Analysis/anonymous-decls.cpp b/clang/test/Analysis/anonymous-decls.cpp
new file mode 100644
index 0000000000000..211184523aa51
--- /dev/null
+++ b/clang/test/Analysis/anonymous-decls.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -std=c++20 %s 2>&1 | FileCheck %s
+
+struct A {
+  static A a;
+  char b;
+  friend bool operator==(A, A) = default;
+};
+bool _ = A() == A::a;
+
+// FIXME: steps 1 and 5 show anonymous function parameters are
+// not handled correctly.
+
+// CHECK-LABEL: bool operator==(A, A) noexcept = default
+// CHECK-NEXT: [B2 (ENTRY)]
+// CHECK-NEXT:    Succs (1): B1
+// CHECK:      [B1]
+// CHECK-NEXT:    1: function-parameter-0-0
+// CHECK-NEXT:    2: [B1.1].b
+// CHECK-NEXT:    3: [B1.2] (ImplicitCastExpr, LValueToRValue, char)
+// CHECK-NEXT:    4: [B1.3] (ImplicitCastExpr, IntegralCast, int)
+// CHECK-NEXT:    5: function-parameter-0-1
+// CHECK-NEXT:    6: [B1.5].b
+// CHECK-NEXT:    7: [B1.6] (ImplicitCastExpr, LValueToRValue, char)
+// CHECK-NEXT:    8: [B1.7] (ImplicitCastExpr, IntegralCast, int)
+// CHECK-NEXT:    9: [B1.4] == [B1.8]
+// CHECK-NEXT:   10: return [B1.9];
+// CHECK-NEXT:    Preds (1): B2
+// CHECK-NEXT:    Succs (1): B0
+// CHECK:      [B0 (EXIT)]
+// CHECK-NEXT:    Preds (1): B1
+
+namespace std {
+template <class> struct iterator_traits;
+template <class, class> struct pair;
+template <class _Tp> struct iterator_traits<_Tp *> {
+  typedef _Tp &reference;
+};
+template <long, class> struct tuple_element;
+template <class> struct tuple_size;
+template <class _T1, class _T2> struct tuple_size<pair<_T1, _T2>> {
+  static const int value = 2;
+};
+template <class _T1, class _T2> struct tuple_element<0, pair<_T1, _T2>> {
+  using type = _T1;
+};
+template <class _T1, class _T2> struct tuple_element<1, pair<_T1, _T2>> {
+  using type = _T2;
+};
+template <long _Ip, class _T1, class _T2>
+tuple_element<_Ip, pair<_T1, _T2>>::type get(pair<_T1, _T2> &);
+struct __wrap_iter {
+  iterator_traits<pair<int, int> *>::reference operator*();
+  void operator++();
+};
+bool operator!=(__wrap_iter, __wrap_iter);
+struct vector {
+  __wrap_iter begin();
+  __wrap_iter end();
+};
+} // namespace std
+int main() {
+  std::vector v;
+  for (auto &[a, b] : v)
+    ;
+}
+
+// FIXME: On steps 8 and 14, a decomposition is referred by name, which they never have.
+
+// CHECK-LABEL: int main()
+// CHECK:      [B3]
+// CHECK-NEXT:   1: operator*
+// CHECK-NEXT:   2: [B3.1] (ImplicitCastExpr, FunctionToPointerDecay, iterator_traits<pair<int, int> *>::reference (*)(void))
+// CHECK-NEXT:   3: __begin1
+// CHECK-NEXT:   4: * [B3.3] (OperatorCall)
+// CHECK-NEXT:   5: auto &;
+// CHECK-NEXT:   6: get<0UL>
+// CHECK-NEXT:   7: [B3.6] (ImplicitCastExpr, FunctionToPointerDecay, typename tuple_element<0L, pair<int, int> >::type (*)(pair<int, int> &))
+// CHECK-NEXT:   8: decomposition-a-b
+// CHECK-NEXT:   9: [B3.7]([B3.8])
+// CHECK-NEXT:  10: [B3.9]
+// CHECK-NEXT:  11: std::tuple_element<0, std::pair<int, int>>::type a = get<0UL>(decomposition-a-b);
+// CHECK-NEXT:  12: get<1UL>
+// CHECK-NEXT:  13: [B3.12] (ImplicitCastExpr, FunctionToPointerDecay, typename tuple_element<1L, pair<int, int> >::type (*)(pair<int, int> &))
+// CHECK-NEXT:  14: decomposition-a-b
+// CHECK-NEXT:  15: [B3.13]([B3.14])
+// CHECK-NEXT:  16: [B3.15]
+// CHECK-NEXT:  17: std::tuple_element<1, std::pair<int, int>>::type b = get<1UL>(decomposition-a-b);
+// CHECK-NEXT:   Preds (1): B1
+// CHECK-NEXT:   Succs (1): B2
diff --git a/clang/test/Analysis/anonymous-parameter.cpp b/clang/test/Analysis/anonymous-parameter.cpp
deleted file mode 100644
index ad2a00b3329cb..0000000000000
--- a/clang/test/Analysis/anonymous-parameter.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -std=c++20 %s 2>&1 | FileCheck %s
-
-struct A {
-  static A a;
-  char b;
-  friend bool operator==(A, A) = default;
-};
-bool _ = A() == A::a;
-
-// FIXME: steps 1 and 5 show anonymous function parameters are
-// not handled correctly.
-
-// CHECK-LABEL: bool operator==(A, A) noexcept = default
-// CHECK-NEXT: [B2 (ENTRY)]
-// CHECK-NEXT:    Succs (1): B1
-// CHECK:      [B1]
-// CHECK-NEXT:    1: function-parameter-0-0
-// CHECK-NEXT:    2: [B1.1].b
-// CHECK-NEXT:    3: [B1.2] (ImplicitCastExpr, LValueToRValue, char)
-// CHECK-NEXT:    4: [B1.3] (ImplicitCastExpr, IntegralCast, int)
-// CHECK-NEXT:    5: function-parameter-0-1
-// CHECK-NEXT:    6: [B1.5].b
-// CHECK-NEXT:    7: [B1.6] (ImplicitCastExpr, LValueToRValue, char)
-// CHECK-NEXT:    8: [B1.7] (ImplicitCastExpr, IntegralCast, int)
-// CHECK-NEXT:    9: [B1.4] == [B1.8]
-// CHECK-NEXT:   10: return [B1.9];
-// CHECK-NEXT:    Preds (1): B2
-// CHECK-NEXT:    Succs (1): B0
-// CHECK:      [B0 (EXIT)]
-// CHECK-NEXT:    Preds (1): B1

From 819cac64004322cf260636c770f41a9016810903 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Sat, 15 Feb 2025 20:15:32 +0100
Subject: [PATCH 159/282] [libc++] Fixes (|multi)_set spaceship operator.
 (#127326)

The operators did not have a _Compare template arguement. The fix
updates the generic container test to use allocators for all types used.
No other issues were found.

Fixes: #127095
(cherry picked from commit 248716f814d1d1fef88911d01a0b551d53c87c7a)
---
 libcxx/include/set                            |   8 +-
 .../test/support/test_container_comparisons.h | 266 +++++++++++-------
 2 files changed, 168 insertions(+), 106 deletions(-)

diff --git a/libcxx/include/set b/libcxx/include/set
index 2784e82760d7e..3c6ea360bd06c 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -1003,9 +1003,9 @@ operator<=(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare,
 
 #  else // _LIBCPP_STD_VER <= 17
 
-template <class _Key, class _Allocator>
+template <class _Key, class _Compare, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key>
-operator<=>(const set<_Key, _Allocator>& __x, const set<_Key, _Allocator>& __y) {
+operator<=>(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
 
@@ -1470,9 +1470,9 @@ operator<=(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key,
 
 #  else // _LIBCPP_STD_VER <= 17
 
-template <class _Key, class _Allocator>
+template <class _Key, class _Compare, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key>
-operator<=>(const multiset<_Key, _Allocator>& __x, const multiset<_Key, _Allocator>& __y) {
+operator<=>(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, _Compare, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way);
 }
 
diff --git a/libcxx/test/support/test_container_comparisons.h b/libcxx/test/support/test_container_comparisons.h
index 543c5899922d0..f7bf78e48a1f8 100644
--- a/libcxx/test/support/test_container_comparisons.h
+++ b/libcxx/test/support/test_container_comparisons.h
@@ -13,51 +13,52 @@
 #include <functional>
 #include <set>
 
+#include "test_allocator.h"
 #include "test_comparisons.h"
 
 // Implementation detail of `test_sequence_container_spaceship`
-template <template <typename...> typename Container, typename Elem, typename Order>
+template <template <typename...> typename Container, typename Elem, typename Allocator, typename Order>
 constexpr void test_sequence_container_spaceship_with_type() {
   // Empty containers
   {
-    Container<Elem> l1;
-    Container<Elem> l2;
+    Container<Elem, Allocator> l1;
+    Container<Elem, Allocator> l2;
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Identical contents
   {
-    Container<Elem> l1{1, 1};
-    Container<Elem> l2{1, 1};
+    Container<Elem, Allocator> l1{1, 1};
+    Container<Elem, Allocator> l2{1, 1};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Elem> l1{1, 1};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1, 1};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Elem> l1{1, 3};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1, 3};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Shorter list
   {
-    Container<Elem> l1{1};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::less));
   }
   // Longer list
   {
-    Container<Elem> l1{1, 2};
-    Container<Elem> l2{1};
+    Container<Elem, Allocator> l1{1, 2};
+    Container<Elem, Allocator> l2{1};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v<Elem, PartialOrder>) {
-    Container<Elem> l1{1, std::numeric_limits<int>::min()};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1, std::numeric_limits<int>::min()};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::unordered));
   }
 }
@@ -69,13 +70,22 @@ constexpr bool test_sequence_container_spaceship() {
   static_assert(std::three_way_comparable<Container<int>>);
 
   // Test different comparison categories
-  test_sequence_container_spaceship_with_type<Container, int, std::strong_ordering>();
-  test_sequence_container_spaceship_with_type<Container, StrongOrder, std::strong_ordering>();
-  test_sequence_container_spaceship_with_type<Container, WeakOrder, std::weak_ordering>();
-  test_sequence_container_spaceship_with_type<Container, PartialOrder, std::partial_ordering>();
+  test_sequence_container_spaceship_with_type<Container, int, std::allocator<int>, std::strong_ordering>();
+  test_sequence_container_spaceship_with_type<Container,
+                                              StrongOrder,
+                                              test_allocator<StrongOrder>,
+                                              std::strong_ordering>();
+  test_sequence_container_spaceship_with_type<Container, WeakOrder, std::allocator<WeakOrder>, std::weak_ordering>();
+  test_sequence_container_spaceship_with_type<Container,
+                                              PartialOrder,
+                                              test_allocator<PartialOrder>,
+                                              std::partial_ordering>();
 
   // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
-  test_sequence_container_spaceship_with_type<Container, LessAndEqComp, std::weak_ordering>();
+  test_sequence_container_spaceship_with_type<Container,
+                                              LessAndEqComp,
+                                              std::allocator<LessAndEqComp>,
+                                              std::weak_ordering>();
 
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
   struct NonComparable {};
@@ -175,109 +185,114 @@ constexpr bool test_sequence_container_adaptor_spaceship() {
 }
 
 // Implementation detail of `test_ordered_map_container_spaceship`
-template <template <typename...> typename Container, typename Key, typename Val, typename Order, typename Compare>
+template <template <typename...> typename Container,
+          typename Key,
+          typename Val,
+          typename Allocator,
+          typename Order,
+          typename Compare>
 constexpr void test_ordered_map_container_spaceship_with_type(Compare comp) {
   // Empty containers
   {
-    Container<Key, Val, Compare> l1{{}, comp};
-    Container<Key, Val, Compare> l2{{}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Identical contents
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 1}}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Shorter list
   {
-    Container<Key, Val, Compare> l1{{{1, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Longer list
   {
-    Container<Key, Val, Compare> l1{{{1, 2}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 2}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v<Val, PartialOrder>) {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::unordered));
   }
 
   // Identical contents
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 1}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 1}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::less));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 3}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 3}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::greater));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 3}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 3}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::greater));
   }
   // Shorter list
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 2}, {3, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 2}, {3, 1}}, comp};
     assert(testOrder(l1, l2, Order::less));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l4{{{3, 1}, {2, 2}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{3, 1}, {2, 2}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::less));
   }
   // Longer list
   {
-    Container<Key, Val, Compare> l1{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::greater));
 
-    Container<Key, Val, Compare> l3{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v<Val, PartialOrder>) {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 3}}, comp};
     assert(testOrder(l1, l2, Order::unordered));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 3}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 3}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::unordered));
   }
 }
@@ -293,94 +308,134 @@ constexpr bool test_ordered_map_container_spaceship() {
   static_assert(std::three_way_comparable<Container<int, int>>);
 
   // Test different comparison categories
-  test_ordered_map_container_spaceship_with_type<Container, int, int, std::strong_ordering>(std::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, int, std::strong_ordering>(std::greater{});
-  test_ordered_map_container_spaceship_with_type<Container, int, StrongOrder, std::strong_ordering>(std::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, StrongOrder, std::strong_ordering>(std::greater{});
-  test_ordered_map_container_spaceship_with_type<Container, int, WeakOrder, std::weak_ordering>(std::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, WeakOrder, std::weak_ordering>(std::greater{});
-  test_ordered_map_container_spaceship_with_type<Container, int, PartialOrder, std::partial_ordering>(std ::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, PartialOrder, std::partial_ordering>(std ::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 int,
+                                                 std::allocator<std::pair<const int, int>>,
+                                                 std::strong_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 int,
+                                                 test_allocator<std::pair<const int, int>>,
+                                                 std::strong_ordering>(std::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 StrongOrder,
+                                                 std::allocator<std::pair<const int, StrongOrder>>,
+                                                 std::strong_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 StrongOrder,
+                                                 test_allocator<std::pair<const int, StrongOrder>>,
+                                                 std::strong_ordering>(std::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 WeakOrder,
+                                                 std::allocator<std::pair<const int, WeakOrder>>,
+                                                 std::weak_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 WeakOrder,
+                                                 test_allocator<std::pair<const int, WeakOrder>>,
+                                                 std::weak_ordering>(std::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 PartialOrder,
+                                                 std::allocator<std::pair<const int, PartialOrder>>,
+                                                 std::partial_ordering>(std ::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 PartialOrder,
+                                                 test_allocator<std::pair<const int, PartialOrder>>,
+                                                 std::partial_ordering>(std ::greater{});
 
   // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
-  test_ordered_map_container_spaceship_with_type<Container, int, LessAndEqComp, std::weak_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 LessAndEqComp,
+                                                 std::allocator<std::pair<const int, LessAndEqComp>>,
+                                                 std::weak_ordering>(std::less{});
 
   return true;
 }
 
 // Implementation detail of `test_ordered_set_container_spaceship`
-template <template <typename...> typename Container, typename Elem, typename Order, typename Compare>
+template <template <typename...> typename Container,
+          typename Elem,
+          typename Allocator,
+          typename Order,
+          typename Compare>
 constexpr void test_ordered_set_spaceship_with_type(Compare comp) {
   // Empty containers
   {
-    Container<Elem, Compare> l1{{}, comp};
-    Container<Elem, Compare> l2{{}, comp};
+    Container<Elem, Compare, Allocator> l1{{}, comp};
+    Container<Elem, Compare, Allocator> l2{{}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Identical contents
   {
-    Container<Elem, Compare> l1{{1, 1, 2}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Elem, Compare> l1{{1, 1, 2, 3}, comp};
-    Container<Elem, Compare> l2{{1, 2, 2, 4}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 2, 2, 4}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Elem, Compare> l1{{1, 2, 2, 4}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 2, 2, 4}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2, 3}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Shorter list
   {
-    Container<Elem, Compare> l1{{1, 1, 2, 2}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2, 2}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2, 2, 3}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Longer list
   {
-    Container<Elem, Compare> l1{{1, 1, 2, 2, 3}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2, 2}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v< Container<Elem>, std::multiset<PartialOrder>>) {
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::min()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::min()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::unordered));
     }
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::max()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::max()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::unordered));
     }
   }
   if constexpr (std::is_same_v< Container<Elem>, std::set<PartialOrder>>) {
     // Unordered values are not supported for `set`
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::min()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::min()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::less));
     }
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::max()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::max()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::less));
     }
   }
   if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::greater{})>) {
-    Container<Elem, Compare> l1{{1, std::numeric_limits<int>::min()}, comp};
-    Container<Elem, Compare> l2{{1, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::min()}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::greater{})>) {
-    Container<Elem, Compare> l1{{1, std::numeric_limits<int>::max()}, comp};
-    Container<Elem, Compare> l2{{1, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::max()}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
 }
@@ -396,17 +451,24 @@ constexpr bool test_ordered_set_container_spaceship() {
   static_assert(std::three_way_comparable<Container<int>>);
 
   // Test different comparison categories
-  test_ordered_set_spaceship_with_type<Container, int, std::strong_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, int, std::strong_ordering>(std::greater{});
-  test_ordered_set_spaceship_with_type<Container, StrongOrder, std::strong_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, StrongOrder, std::strong_ordering>(std::greater{});
-  test_ordered_set_spaceship_with_type<Container, WeakOrder, std::weak_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, WeakOrder, std::weak_ordering>(std::greater{});
-  test_ordered_set_spaceship_with_type<Container, PartialOrder, std::partial_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, PartialOrder, std::partial_ordering>(std::greater{});
+  test_ordered_set_spaceship_with_type<Container, int, std::allocator<int>, std::strong_ordering>(std::less{});
+  test_ordered_set_spaceship_with_type<Container, int, test_allocator<int>, std::strong_ordering>(std::greater{});
+  test_ordered_set_spaceship_with_type<Container, StrongOrder, std::allocator<StrongOrder>, std::strong_ordering>(
+      std::less{});
+  test_ordered_set_spaceship_with_type<Container, StrongOrder, test_allocator<StrongOrder>, std::strong_ordering>(
+      std::greater{});
+  test_ordered_set_spaceship_with_type<Container, WeakOrder, std::allocator<WeakOrder>, std::weak_ordering>(
+      std::less{});
+  test_ordered_set_spaceship_with_type<Container, WeakOrder, test_allocator<WeakOrder>, std::weak_ordering>(
+      std::greater{});
+  test_ordered_set_spaceship_with_type<Container, PartialOrder, std::allocator<PartialOrder>, std::partial_ordering>(
+      std::less{});
+  test_ordered_set_spaceship_with_type<Container, PartialOrder, test_allocator<PartialOrder>, std::partial_ordering>(
+      std::greater{});
 
   // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
-  test_ordered_set_spaceship_with_type<Container, LessAndEqComp, std::weak_ordering>(std::less{});
+  test_ordered_set_spaceship_with_type<Container, LessAndEqComp, std::allocator<LessAndEqComp>, std::weak_ordering>(
+      std::less{});
 
   return true;
 }

From 3dedc9985a65ecdd90bf1d21e137bc641b337bd2 Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Mon, 17 Feb 2025 09:30:48 -0600
Subject: [PATCH 160/282] [Hexagon] Explicitly truncate constant in UAddSubO
 (#127360)

After #117558 landed, this code would assert "Value is not an N-bit
unsigned value" in getConstant(), from a test case in zig.

Co-authored-by:  Craig Topper <craig.topper@sifive.com>
Fixes #127296

(cherry picked from commit 788cb725d8b92a82e41e64540dccca97c9086a58)
---
 .../lib/Target/Hexagon/HexagonISelLowering.cpp |  2 +-
 llvm/test/CodeGen/Hexagon/iss127296.ll         | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/iss127296.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 12ca0c505bd06..5ce5cae2ff906 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3273,7 +3273,7 @@ HexagonTargetLowering::LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const {
     if (Opc == ISD::USUBO) {
       SDValue Op = DAG.getNode(ISD::SUB, dl, VTs.VTs[0], {X, Y});
       SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op,
-                                DAG.getConstant(-1, dl, ty(Op)), ISD::SETEQ);
+                                DAG.getAllOnesConstant(dl, ty(Op)), ISD::SETEQ);
       return DAG.getMergeValues({Op, Ov}, dl);
     }
   }
diff --git a/llvm/test/CodeGen/Hexagon/iss127296.ll b/llvm/test/CodeGen/Hexagon/iss127296.ll
new file mode 100644
index 0000000000000..bf0e7a9881014
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/iss127296.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=hexagon -O0 < %s | FileCheck %s
+
+; CHECK: r0 = add(r0,#-1)
+
+define fastcc void @os.linux.tls.initStatic(i32 %x) {
+  %1 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 1)
+  br label %2
+
+  2:                                                ; preds = %0
+  %3 = extractvalue { i32, i1 } %1, 0
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+

From 34f5f905f7130326736bb63476b07ed55e35cfde Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Tue, 18 Feb 2025 00:15:01 -0800
Subject: [PATCH 161/282] [clang-format] Fix a bug in annotating StartOfName
 (#127545)

Also ensure we can break before ClassHeadName like StartOfName.

Fixes #127470

(cherry picked from commit 13de15c9c49068db850368c45ffed8f7bbf07f20)
---
 clang/lib/Format/TokenAnnotator.cpp           | 6 +++---
 clang/unittests/Format/FormatTest.cpp         | 5 +++++
 clang/unittests/Format/TokenAnnotatorTest.cpp | 4 ++++
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 87abe76ae1f7e..ac5b25d52ce84 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2595,7 +2595,7 @@ class AnnotatingParser {
         (!NextNonComment && !Line.InMacroBody) ||
         (NextNonComment &&
          (NextNonComment->isPointerOrReference() ||
-          NextNonComment->is(tok::string_literal) ||
+          NextNonComment->isOneOf(TT_ClassHeadName, tok::string_literal) ||
           (Line.InPragmaDirective && NextNonComment->is(tok::identifier))))) {
       return false;
     }
@@ -6194,8 +6194,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
                 FormatStyle::PAS_Right &&
             (!Right.Next || Right.Next->isNot(TT_FunctionDeclarationName)));
   }
-  if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName) ||
-      Right.is(tok::kw_operator)) {
+  if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName,
+                    TT_ClassHeadName, tok::kw_operator)) {
     return true;
   }
   if (Left.is(TT_PointerOrReference))
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 30f3533ac73f7..3b7856d6ee150 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -28720,6 +28720,11 @@ TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesAlways) {
                Style);
 }
 
+TEST_F(FormatTest, BreakBeforeClassName) {
+  verifyFormat("class ABSL_ATTRIBUTE_TRIVIAL_ABI ABSL_NULLABILITY_COMPATIBLE\n"
+               "    ArenaSafeUniquePtr {};");
+}
+
 } // namespace
 } // namespace test
 } // namespace format
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 7e90bde8755fb..dffb07c89bacc 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3239,6 +3239,10 @@ TEST_F(TokenAnnotatorTest, StartOfName) {
   EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl);
   EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName);
 
+  Tokens = annotate("class FOO BAR C {};");
+  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown); // Not StartOfName
+
   auto Style = getLLVMStyle();
   Style.StatementAttributeLikeMacros.push_back("emit");
   Tokens = annotate("emit foo = 0;", Style);

From f567c03221769d0e5e267a3c239079756aadea80 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 18 Feb 2025 09:14:04 +0000
Subject: [PATCH 162/282] [libclc] Disable external-calls testing for clspv
 targets (#127529)

These targets don't include all OpenCL builtins, so there will always be
external calls in the final bytecode module.

Fixes #127316.

(cherry picked from commit 9fec0a0942f5a11f4dcfec20aa485a8513661720)
---
 libclc/cmake/modules/AddLibclc.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index b520626c6ffd1..717121abb8c98 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -345,8 +345,9 @@ function(add_libclc_builtin_set)
   add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} )
   set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
 
-  # nvptx-- targets don't include workitem builtins
-  if( NOT ARG_TRIPLE MATCHES ".*ptx.*--$" )
+  # nvptx-- targets don't include workitem builtins, and clspv targets don't
+  # include all OpenCL builtins
+  if( NOT ARG_ARCH MATCHES "^(nvptx|clspv)(64)?$" )
     add_test( NAME external-calls-${obj_suffix}
       COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
       WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )

From 6b57839efb20fdbd43935b8fdd30665d0da6bf29 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Mon, 17 Feb 2025 14:40:31 +0100
Subject: [PATCH 163/282] [clang] Fix false positive regression for lifetime
 analysis warning. (#127460)

This fixes a false positive caused by #114044.

For `GSLPointer*` types, it's less clear whether the lifetime issue is
about the GSLPointer object itself or the owner it points to. To avoid
false positives, we take a conservative approach in our heuristic.

Fixes #127195

(This will be backported to release 20).

(cherry picked from commit 9c49b188b8e1434eb774ee8422124ad3e8870dce)
---
 clang/lib/Sema/CheckExprLifetime.cpp          |  5 ++--
 clang/test/Sema/Inputs/lifetime-analysis.h    |  2 ++
 .../Sema/warn-lifetime-analysis-nocfg.cpp     | 24 +++++++++++++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index 8963cad86dbca..1f87001f35b57 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -1239,11 +1239,12 @@ static AnalysisResult analyzePathForGSLPointer(const IndirectLocalPath &Path,
     }
     // Check the return type, e.g.
     //   const GSLOwner& func(const Foo& foo [[clang::lifetimebound]])
+    //   GSLOwner* func(cosnt Foo& foo [[clang::lifetimebound]])
     //   GSLPointer func(const Foo& foo [[clang::lifetimebound]])
     if (FD &&
-        ((FD->getReturnType()->isReferenceType() &&
+        ((FD->getReturnType()->isPointerOrReferenceType() &&
           isRecordWithAttr<OwnerAttr>(FD->getReturnType()->getPointeeType())) ||
-         isPointerLikeType(FD->getReturnType())))
+         isGLSPointerType(FD->getReturnType())))
       return Report;
 
     return Abandon;
diff --git a/clang/test/Sema/Inputs/lifetime-analysis.h b/clang/test/Sema/Inputs/lifetime-analysis.h
index d318033ff0cc4..2072e4603cead 100644
--- a/clang/test/Sema/Inputs/lifetime-analysis.h
+++ b/clang/test/Sema/Inputs/lifetime-analysis.h
@@ -61,6 +61,7 @@ struct basic_string_view {
   basic_string_view();
   basic_string_view(const T *);
   const T *begin() const;
+  const T *data() const;
 };
 using string_view = basic_string_view<char>;
 
@@ -80,6 +81,7 @@ struct basic_string {
   const T *c_str() const;
   operator basic_string_view<T> () const;
   using const_iterator = iter<T>;
+  const T *data() const;
 };
 using string = basic_string<char>;
 
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
index 04bb1330ded4c..66a2a19ceb321 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
@@ -852,3 +852,27 @@ struct Test {
 };
 
 } // namespace GH120543
+
+namespace GH127195 {
+template <typename T>
+struct StatusOr {
+  T* operator->() [[clang::lifetimebound]];
+  T* value() [[clang::lifetimebound]];
+};
+
+const char* foo() {
+  StatusOr<std::string> s;
+  return s->data(); // expected-warning {{address of stack memory associated with local variable}}
+  
+  StatusOr<std::string_view> s2;
+  return s2->data();
+
+  StatusOr<StatusOr<std::string_view>> s3;
+  return s3.value()->value()->data();
+
+  // FIXME: nested cases are not supported now.
+  StatusOr<StatusOr<std::string>> s4;
+  return s4.value()->value()->data();
+}
+
+} // namespace GH127195

From e503227bc57625a0a22b450f5bd3e78df96ca4fe Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vigneshwar.jayakumar@amd.com>
Date: Tue, 11 Feb 2025 12:32:23 -0600
Subject: [PATCH 164/282] AMDGPU: Handle gfx950 XDL Write-VGPR-VALU-WAW wait
 state change (#126132)

There are additional wait states for XDL write VALU WAW hazard in gfx950
compared to gfx940.

(cherry picked from commit 1188b1ff7b956cb65d8ddda5f1e56c432f1a57c7)
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 17 +++++++------
 .../CodeGen/AMDGPU/mai-hazards-gfx940.mir     | 24 ++++++++++++-------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 537181710ed32..646663a92e5e8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2605,12 +2605,14 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
   return NumPasses + 2;
 }
 
-static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
-  // 2 pass -> 5
-  // 4 pass -> 7
-  // 8 pass -> 11
-  // 16 pass -> 19
-  return NumPasses + 3;
+static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
+                                                       bool IsGFX950) {
+  // xdl def cycles | gfx940 | gfx950
+  // 2 pass         |  5        5
+  // 4 pass         |  7        8
+  // 8 pass         |  11       12
+  // 16 pass        |  19       20
+  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
 }
 
 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
@@ -2858,7 +2860,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
       } else if (ST.hasGFX940Insts()) {
         NeedWaitStates =
             isXDL(ST, *MFMA)
-                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
+                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
+                      NumPasses, ST.hasGFX950Insts())
                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
       } else {
         switch (NumPasses) {
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index ef30c9a44b2b5..0af37ad8c896e 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -958,7 +958,8 @@ body:             |
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
 name:            xdl_smfma16x16_write_vgpr_valu_write
 body:             |
@@ -970,7 +971,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
 name:            xdl_smfma32x32_write_vgpr_valu_write
 body:             |
@@ -991,7 +993,8 @@ body:             |
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_FMA_F16_e64
 name:            xdl_smfma16x16_write_vgpr_valu_f16_write
 body:             |
@@ -1003,7 +1006,8 @@ body:             |
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_FMA_F16_e64
 name:            xdl_smfma32x32_write_vgpr_valu_f16_write
 body:             |
@@ -1024,7 +1028,8 @@ body:             |
 # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
 # GCN:      V_MFMA
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32_sdwa
 name:            xdl_smfma16x16_write_vgpr_valu_sdwa_write
 body:             |
@@ -1761,7 +1766,8 @@ body:             |
 ...
 # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_write
 # GCN:      V_MFMA
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MOV_B32
 name:            xdl_sgemm16X16X16_mfma_write_vgpr_valu_write
 body:             |
@@ -2072,7 +2078,8 @@ body:             |
 ...
 # GCN-LABEL: name: smfmac16x16_read_vgpr_srcc_valu_write
 # GCN:      V_SMFMAC
-# GCN-NEXT: S_NOP 6
+# GFX940-NEXT: S_NOP 6
+# GFX950-NEXT: S_NOP 7
 # GCN-NEXT: V_MOV_B32
 name:            smfmac16x16_read_vgpr_srcc_valu_write
 body:             |
@@ -2102,7 +2109,8 @@ body:             |
 # GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write
 # GCN:      V_SMFMAC
 # GCN-NEXT: S_NOP 7
-# GCN-NEXT: S_NOP 2
+# GFX940-NEXT: S_NOP 2
+# GFX950-NEXT: S_NOP 3
 # GCN-NEXT: V_MOV_B32
 name:            smfmac32x32_read_vgpr_srcc_valu_write
 body:             |

From 04295354b0f0f101773c4f9437680d45a70bab24 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Fri, 7 Feb 2025 15:40:16 +0100
Subject: [PATCH 165/282] Revert "[libc++] Reduce std::conjunction overhead
 (#124259)"

It turns out that the new implementation takes significantly more stack
memory for some reason.

This reverts commit 2696e4fb9567d23ce065a067e7f4909b310daf50.

(cherry picked from commit 0227396417d4625bc93affdd8957ff8d90c76299)
---
 libcxx/include/__type_traits/conjunction.h | 42 ++++++++++++----------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h
index 6b6717a50a468..ad9656acd47ec 100644
--- a/libcxx/include/__type_traits/conjunction.h
+++ b/libcxx/include/__type_traits/conjunction.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_CONJUNCTION_H
 
 #include <__config>
+#include <__type_traits/conditional.h>
+#include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_same.h>
 
@@ -19,29 +21,22 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <bool>
-struct _AndImpl;
+template <class...>
+using __expand_to_true _LIBCPP_NODEBUG = true_type;
 
-template <>
-struct _AndImpl<true> {
-  template <class _Res, class _First, class... _Rest>
-  using _Result _LIBCPP_NODEBUG =
-      typename _AndImpl<bool(_First::value) && sizeof...(_Rest) != 0>::template _Result<_First, _Rest...>;
-};
+template <class... _Pred>
+__expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int);
 
-template <>
-struct _AndImpl<false> {
-  template <class _Res, class...>
-  using _Result _LIBCPP_NODEBUG = _Res;
-};
+template <class...>
+false_type __and_helper(...);
 
 // _And always performs lazy evaluation of its arguments.
 //
 // However, `_And<_Pred...>` itself will evaluate its result immediately (without having to
 // be instantiated) since it is an alias, unlike `conjunction<_Pred...>`, which is a struct.
 // If you want to defer the evaluation of `_And<_Pred...>` itself, use `_Lazy<_And, _Pred...>`.
-template <class... _Args>
-using _And _LIBCPP_NODEBUG = typename _AndImpl<sizeof...(_Args) != 0>::template _Result<true_type, _Args...>;
+template <class... _Pred>
+using _And _LIBCPP_NODEBUG = decltype(std::__and_helper<_Pred...>(0));
 
 template <bool... _Preds>
 struct __all_dummy;
@@ -51,11 +46,22 @@ struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)...
 
 #if _LIBCPP_STD_VER >= 17
 
-template <class... _Args>
-struct _LIBCPP_NO_SPECIALIZATIONS conjunction : _And<_Args...> {};
+template <class...>
+struct _LIBCPP_NO_SPECIALIZATIONS conjunction : true_type {};
+
+_LIBCPP_DIAGNOSTIC_PUSH
+#  if __has_warning("-Winvalid-specialization")
+_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization")
+#  endif
+template <class _Arg>
+struct conjunction<_Arg> : _Arg {};
+
+template <class _Arg, class... _Args>
+struct conjunction<_Arg, _Args...> : conditional_t<!bool(_Arg::value), _Arg, conjunction<_Args...>> {};
+_LIBCPP_DIAGNOSTIC_POP
 
 template <class... _Args>
-_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = _And<_Args...>::value;
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = conjunction<_Args...>::value;
 
 #endif // _LIBCPP_STD_VER >= 17
 

From 876a5c9e5905a9666748632afba1ff83200ed95b Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Sat, 15 Feb 2025 10:54:00 +0100
Subject: [PATCH 166/282] [libc++] Avoid including <features.h> on arbitrary
 platforms (#125587)

This partially reverts commit 5f2389d4. That commit started checking
whether <features.h> was a valid include unconditionally, however codebases
are free to have such a header on their search path, which breaks compilation.
LLVM libc now provides a more standard way of getting configuration macros
like __LLVM_LIBC__.

After this patch, we only include <features.h> when we're on Linux or
when we're compiling for GPUs.

(cherry picked from commit cffc1ac3491c891ef4f80bcbfa685710e477eeac)
---
 libcxx/include/__configuration/platform.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h
index 2a92ce209b91f..cff99376ee24b 100644
--- a/libcxx/include/__configuration/platform.h
+++ b/libcxx/include/__configuration/platform.h
@@ -30,12 +30,9 @@
 // ... add new file formats here ...
 #endif
 
-// To detect which libc we're using
-#if __has_include(<features.h>)
+// Need to detect which libc we're using if we're on Linux.
+#if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__)
 #  include <features.h>
-#endif
-
-#if defined(__linux__)
 #  if defined(__GLIBC_PREREQ)
 #    define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b)
 #  else

From 2b70b17d30744ee6720eb2645ef8b61e043ed295 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 19 Feb 2025 06:53:30 -0800
Subject: [PATCH 167/282] flang: Fix build with latest libc++ (#127362)

I think this first stopped working with
954836634abb446f18719b14120c386a929a42d1. This patch fixes the following
error:

/home/runner/work/llvm-project/llvm-project/flang/runtime/io-api-minimal.cpp:153:11:
error: '__libcpp_verbose_abort' is missing exception specification
'noexcept'
   153 | void std::__libcpp_verbose_abort(char const *format, ...) {
       |           ^
| noexcept
/mnt/build/bin/../include/c++/v1/__verbose_abort:30:28: note: previous
declaration is here
30 | __printf__, 1, 2) void __libcpp_verbose_abort(const char* __format,
...) _LIBCPP_VERBOSE_ABORT_NOEXCEPT;
       |                            ^
1 error generated.

(cherry picked from commit 2b340c10a611d929fee25e6222909c8915e3d6b6)
---
 flang/runtime/io-api-minimal.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/runtime/io-api-minimal.cpp b/flang/runtime/io-api-minimal.cpp
index 68768427be0c2..93ac82248aa4c 100644
--- a/flang/runtime/io-api-minimal.cpp
+++ b/flang/runtime/io-api-minimal.cpp
@@ -150,7 +150,8 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
 // Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency
 // on the version provided by libc++.
 
-void std::__libcpp_verbose_abort(char const *format, ...) {
+void std::__libcpp_verbose_abort(char const *format, ...) noexcept(
+    noexcept(std::__libcpp_verbose_abort(""))) {
   va_list list;
   va_start(list, format);
   std::vfprintf(stderr, format, list);

From 239faf0b9dbcf092337a8feb696f1f9bf0671241 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin@arm.com>
Date: Thu, 13 Feb 2025 09:43:16 +0000
Subject: [PATCH 168/282] [LLVM][AArch64] Remove aliases of LSUI instructions
 (#126072)

Removes MnemonicAliases added for instructions available with
the LSUI feature (e.g. CAS -> CAST) which are not equivalent.
The aliases stt[add|clr|set]a & stt[add|clr|set]al are also removed.

(cherry picked from commit d44d806faa879dfb7a7ceb58beeb57cf8d5af430)
---
 .../lib/Target/AArch64/AArch64InstrFormats.td |  23 ++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  23 ----
 llvm/test/MC/AArch64/armv8.1a-lse.s           |   2 +
 llvm/test/MC/AArch64/armv9.6a-lsui.s          | 102 +++---------------
 4 files changed, 22 insertions(+), 128 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 3bb5d3cb4d09d..c2eea836fb14f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -12529,26 +12529,19 @@ multiclass STOPregister<string asm, string instr> {
                     !cast<Instruction>(instr # "X")>;
 }
 
+let Predicates = [HasLSUI] in
 class BaseSTOPregisterLSUI<string asm, RegisterClass OP, Register Reg,
                         Instruction inst> :
-      InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
+      InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn), 0>;
 
 multiclass STOPregisterLSUI<string asm, string instr> {
-  def : BaseSTOPregisterLSUI<asm # "a",        GPR32, WZR,
-                    !cast<Instruction>(instr # "W")>;
-  def : BaseSTOPregisterLSUI<asm # "a",        GPR64, XZR,
-                    !cast<Instruction>(instr # "X")>;
-  def : BaseSTOPregisterLSUI<asm # "l",        GPR32, WZR,
-                    !cast<Instruction>(instr # "W")>;
-  def : BaseSTOPregisterLSUI<asm # "l",        GPR64, XZR,
-                    !cast<Instruction>(instr # "X")>;
-  def : BaseSTOPregisterLSUI<asm # "al",        GPR32, WZR,
-                    !cast<Instruction>(instr # "W")>;
-  def : BaseSTOPregisterLSUI<asm # "al",        GPR64, XZR,
-                    !cast<Instruction>(instr # "X")>;
-  def : BaseSTOPregisterLSUI<asm,        GPR32, WZR,
+  def : BaseSTOPregisterLSUI<asm # "l", GPR32, WZR,
+                    !cast<Instruction>(instr # "LW")>;
+  def : BaseSTOPregisterLSUI<asm # "l", GPR64, XZR,
+                    !cast<Instruction>(instr # "LX")>;
+  def : BaseSTOPregisterLSUI<asm, GPR32, WZR,
                     !cast<Instruction>(instr # "W")>;
-  def : BaseSTOPregisterLSUI<asm,        GPR64, XZR,
+  def : BaseSTOPregisterLSUI<asm, GPR64, XZR,
                     !cast<Instruction>(instr # "X")>;
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b77246200db64..a3a607825c7f6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2667,21 +2667,11 @@ defm CASLT  : CompareAndSwapUnprivileged<0b11, 0, 1, "l">;
 defm CASAT  : CompareAndSwapUnprivileged<0b11, 1, 0, "a">;
 defm CASALT : CompareAndSwapUnprivileged<0b11, 1, 1, "al">;
 
-def : MnemonicAlias<"cas", "cast">;
-def : MnemonicAlias<"casl", "caslt">;
-def : MnemonicAlias<"casa", "casat">;
-def : MnemonicAlias<"casal", "casalt">;
-
 // v9.6-a atomic CASPT
 defm CASPT   : CompareAndSwapPairUnprivileged<0b01, 0, 0, "">;
 defm CASPLT  : CompareAndSwapPairUnprivileged<0b01, 0, 1, "l">;
 defm CASPAT  : CompareAndSwapPairUnprivileged<0b01, 1, 0, "a">;
 defm CASPALT : CompareAndSwapPairUnprivileged<0b01, 1, 1, "al">;
-
-def : MnemonicAlias<"casp", "caspt">;
-def : MnemonicAlias<"caspl", "casplt">;
-def : MnemonicAlias<"caspa", "caspat">;
-def : MnemonicAlias<"caspal", "caspalt">;
 }
 
 // v8.1 atomic SWP
@@ -2696,11 +2686,6 @@ let Predicates = [HasLSUI] in {
   defm SWPTA  : SwapLSUI<1, 0, "a">;
   defm SWPTL  : SwapLSUI<0, 1, "l">;
   defm SWPTAL : SwapLSUI<1, 1, "al">;
-
-  def : MnemonicAlias<"swp", "swpt">;
-  def : MnemonicAlias<"swpa", "swpta">;
-  def : MnemonicAlias<"swpl", "swptl">;
-  def : MnemonicAlias<"swpal", "swptal">;
 }
 
 // v9.6-a unprivileged atomic LD<OP> (FEAT_LSUI)
@@ -4865,22 +4850,14 @@ let Predicates = [HasLSUI] in {
 defm LDTXRW : LoadUnprivilegedLSUI<0b10, GPR32, "ldtxr">;
 defm LDTXRX : LoadUnprivilegedLSUI<0b11, GPR64, "ldtxr">;
 
-def : MnemonicAlias<"ldxr", "ldtxr">;
-
 def LDATXRW : LoadExclusiveLSUI <0b10, 1, 1, GPR32, "ldatxr">;
 def LDATXRX : LoadExclusiveLSUI <0b11, 1, 1, GPR64, "ldatxr">;
 
-def : MnemonicAlias<"ldaxr", "ldatxr">;
-
 defm STTXRW : StoreUnprivilegedLSUI<0b10, GPR32, "sttxr">;
 defm STTXRX : StoreUnprivilegedLSUI<0b11, GPR64, "sttxr">;
 
-def : MnemonicAlias<"stxr", "sttxr">;
-
 def STLTXRW : StoreExclusiveLSUI<0b10, 0, 1, GPR32, "stltxr">;
 def STLTXRX : StoreExclusiveLSUI<0b11, 0, 1, GPR64, "stltxr">;
-
-def : MnemonicAlias<"stlxr", "stltxr">;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AArch64/armv8.1a-lse.s b/llvm/test/MC/AArch64/armv8.1a-lse.s
index b5bbbe66c6ae2..eb9a30dea9e6b 100644
--- a/llvm/test/MC/AArch64/armv8.1a-lse.s
+++ b/llvm/test/MC/AArch64/armv8.1a-lse.s
@@ -7,6 +7,8 @@
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mcpu=tsv110 -show-encoding < %s 2> %t | FileCheck %s
 // RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r -show-encoding < %s 2> %t | FileCheck %s
+// RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a,+lse,+lsui -show-encoding < %s 2> %t | FileCheck %s
 // RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
   .text
 
diff --git a/llvm/test/MC/AArch64/armv9.6a-lsui.s b/llvm/test/MC/AArch64/armv9.6a-lsui.s
index b48db1f9b5570..d4a5e1f980560 100644
--- a/llvm/test/MC/AArch64/armv9.6a-lsui.s
+++ b/llvm/test/MC/AArch64/armv9.6a-lsui.s
@@ -313,42 +313,16 @@ _func:
 // ERROR: instruction requires: lsui
 
   sttaddl    w0, [x2]
-// CHECK: ldtadd	w0, wzr, [x2]                   // encoding: [0x5f,0x04,0x20,0x19]
+// CHECK: ldtaddl	w0, wzr, [x2]                   // encoding: [0x5f,0x04,0x60,0x19]
 // ERROR: instruction requires: lsui
   sttaddl    w2, [sp]
-// CHECK: ldtadd	w2, wzr, [sp]                   // encoding: [0xff,0x07,0x22,0x19]
+// CHECK: ldtaddl	w2, wzr, [sp]                   // encoding: [0xff,0x07,0x62,0x19]
 // ERROR: instruction requires: lsui
   sttaddl    x0, [x2]
-// CHECK: ldtadd	x0, xzr, [x2]                   // encoding: [0x5f,0x04,0x20,0x59]
+// CHECK: ldtaddl	x0, xzr, [x2]                   // encoding: [0x5f,0x04,0x60,0x59]
 // ERROR: instruction requires: lsui
   sttaddl    x2, [sp]
-// CHECK: ldtadd	x2, xzr, [sp]                   // encoding: [0xff,0x07,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttadda    w0, [x2]
-// CHECK: ldtadd	w0, wzr, [x2]                   // encoding: [0x5f,0x04,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttadda    w2, [sp]
-// CHECK: ldtadd	w2, wzr, [sp]                   // encoding: [0xff,0x07,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttadda    x0, [x2]
-// CHECK: ldtadd	x0, xzr, [x2]                   // encoding: [0x5f,0x04,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttadda    x2, [sp]
-// CHECK: ldtadd	x2, xzr, [sp]                   // encoding: [0xff,0x07,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttaddal   w0, [x2]
-// CHECK: ldtadd	w0, wzr, [x2]                   // encoding: [0x5f,0x04,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttaddal   w2, [sp]
-// CHECK: ldtadd	w2, wzr, [sp]                   // encoding: [0xff,0x07,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttaddal   x0, [x2]
-// CHECK: ldtadd	x0, xzr, [x2]                   // encoding: [0x5f,0x04,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttaddal   x2, [sp]
-// CHECK: ldtadd	x2, xzr, [sp]                   // encoding: [0xff,0x07,0x22,0x59]
+// CHECK: ldtaddl	x2, xzr, [sp]                   // encoding: [0xff,0x07,0x62,0x59]
 // ERROR: instruction requires: lsui
 
   sttclr     w0, [x2]
@@ -362,45 +336,19 @@ _func:
 // ERROR: instruction requires: lsui
   sttclr     x2, [sp]
 // CHECK: ldtclr	x2, xzr, [sp]                   // encoding: [0xff,0x17,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttclra    w0, [x2]
-// CHECK: ldtclr	w0, wzr, [x2]                   // encoding: [0x5f,0x14,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttclra    w2, [sp]
-// CHECK: ldtclr	w2, wzr, [sp]                   // encoding: [0xff,0x17,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttclra    x0, [x2]
-// CHECK: ldtclr	x0, xzr, [x2]                   // encoding: [0x5f,0x14,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttclra    x2, [sp]
-// CHECK: ldtclr	x2, xzr, [sp]                   // encoding: [0xff,0x17,0x22,0x59]
 // ERROR: instruction requires: lsui
 
   sttclrl    w0, [x2]
-// CHECK: ldtclr	w0, wzr, [x2]                   // encoding: [0x5f,0x14,0x20,0x19]
+// CHECK: ldtclrl	w0, wzr, [x2]                   // encoding: [0x5f,0x14,0x60,0x19]
 // ERROR: instruction requires: lsui
   sttclrl    w2, [sp]
-// CHECK: ldtclr	w2, wzr, [sp]                   // encoding: [0xff,0x17,0x22,0x19]
+// CHECK: ldtclrl	w2, wzr, [sp]                   // encoding: [0xff,0x17,0x62,0x19]
 // ERROR: instruction requires: lsui
   sttclrl    x0, [x2]
-// CHECK: ldtclr	x0, xzr, [x2]                   // encoding: [0x5f,0x14,0x20,0x59]
+// CHECK: ldtclrl	x0, xzr, [x2]                   // encoding: [0x5f,0x14,0x60,0x59]
 // ERROR: instruction requires: lsui
   sttclrl    x2, [sp]
-// CHECK: ldtclr	x2, xzr, [sp]                   // encoding: [0xff,0x17,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttclral   w0, [x2]
-// CHECK: ldtclr	w0, wzr, [x2]                   // encoding: [0x5f,0x14,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttclral   x2, [sp]
-// CHECK: ldtclr	x2, xzr, [sp]                   // encoding: [0xff,0x17,0x22,0x59]
-// ERROR: instruction requires: lsui
-  sttclral   x0, [x2]
-// CHECK: ldtclr	x0, xzr, [x2]                   // encoding: [0x5f,0x14,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttclral   x2, [sp]
-// CHECK: ldtclr	x2, xzr, [sp]                   // encoding: [0xff,0x17,0x22,0x59]
+// CHECK: ldtclrl	x2, xzr, [sp]                   // encoding: [0xff,0x17,0x62,0x59]
 // ERROR: instruction requires: lsui
 
   sttset     w0, [x2]
@@ -414,45 +362,19 @@ _func:
 // ERROR: instruction requires: lsui
   sttset     x2, [sp]
 // CHECK: ldtset	x2, xzr, [sp]                   // encoding: [0xff,0x37,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttseta    w0, [x2]
-// CHECK: ldtset	w0, wzr, [x2]                   // encoding: [0x5f,0x34,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttseta    w2, [sp]
-// CHECK: ldtset	w2, wzr, [sp]                   // encoding: [0xff,0x37,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttseta    x0, [x2]
-// CHECK: ldtset	x0, xzr, [x2]                   // encoding: [0x5f,0x34,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttseta    x2, [sp]
-// CHECK: ldtset	x2, xzr, [sp]                   // encoding: [0xff,0x37,0x22,0x59]
 // ERROR: instruction requires: lsui
 
   sttsetl    w0, [x2]
-// CHECK: ldtset	w0, wzr, [x2]                   // encoding: [0x5f,0x34,0x20,0x19]
+// CHECK: ldtsetl	w0, wzr, [x2]                   // encoding: [0x5f,0x34,0x60,0x19]
 // ERROR: instruction requires: lsui
   sttsetl    w2, [sp]
-// CHECK: ldtset	w2, wzr, [sp]                   // encoding: [0xff,0x37,0x22,0x19]
+// CHECK: ldtsetl	w2, wzr, [sp]                   // encoding: [0xff,0x37,0x62,0x19]
 // ERROR: instruction requires: lsui
   sttsetl    x0, [x2]
-// CHECK: ldtset	x0, xzr, [x2]                   // encoding: [0x5f,0x34,0x20,0x59]
+// CHECK: ldtsetl	x0, xzr, [x2]                   // encoding: [0x5f,0x34,0x60,0x59]
 // ERROR: instruction requires: lsui
   sttsetl    x2, [sp]
-// CHECK: ldtset	x2, xzr, [sp]                   // encoding: [0xff,0x37,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttsetal   w0, [x2]
-// CHECK: ldtset	w0, wzr, [x2]                   // encoding: [0x5f,0x34,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttsetal   x2, [sp]
-// CHECK: ldtset	x2, xzr, [sp]                   // encoding: [0xff,0x37,0x22,0x59]
-// ERROR: instruction requires: lsui
-  sttsetal   x0, [x2]
-// CHECK: ldtset	x0, xzr, [x2]                   // encoding: [0x5f,0x34,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttsetal   x2, [sp]
-// CHECK: ldtset	x2, xzr, [sp]                   // encoding: [0xff,0x37,0x22,0x59]
+// CHECK: ldtsetl	x2, xzr, [sp]                   // encoding: [0xff,0x37,0x62,0x59]
 // ERROR: instruction requires: lsui
 
 //------------------------------------------------------------------------------

From e7e2c16cac97696a89a053f72b94112739b6897f Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Mon, 17 Feb 2025 19:08:07 +0100
Subject: [PATCH 169/282] [libc++][TZDB] Fixes mapping of nonexisting time.
 (#127330)

All non-existing local times in a contiguous range should map to the
same time point. This fixes a bug, were the times inside the range were
mapped to the wrong time.

Fixes: #113654
(cherry picked from commit 941f7cbf5a3e7aa9f36b002dc22cfdb4ff50fea8)
---
 libcxx/include/__chrono/time_zone.h             |  8 ++++++--
 .../time.zone.members/to_sys_choose.pass.cpp    | 17 +++++++++++++++--
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/__chrono/time_zone.h b/libcxx/include/__chrono/time_zone.h
index ab5c22eceaaf1..d18d59d2736bf 100644
--- a/libcxx/include/__chrono/time_zone.h
+++ b/libcxx/include/__chrono/time_zone.h
@@ -103,10 +103,14 @@ class _LIBCPP_AVAILABILITY_TZDB time_zone {
   to_sys(const local_time<_Duration>& __time, choose __z) const {
     local_info __info = get_info(__time);
     switch (__info.result) {
-    case local_info::unique:
-    case local_info::nonexistent: // first and second are the same
+    case local_info::unique: // first and second are the same
       return sys_time<common_type_t<_Duration, seconds>>{__time.time_since_epoch() - __info.first.offset};
 
+    case local_info::nonexistent:
+      // first and second are the same
+      // All non-existing values are converted to the same time.
+      return sys_time<common_type_t<_Duration, seconds>>{__info.first.end};
+
     case local_info::ambiguous:
       switch (__z) {
       case choose::earliest:
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp
index bad4ef352e9b9..1147c9fadf9ae 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp
@@ -88,7 +88,7 @@ static void test_nonexistent() {
   // Pick an historic date where it's well known what the time zone rules were.
   // This makes it unlikely updates to the database change these rules.
   std::chrono::local_time<std::chrono::seconds> time{
-      (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h + 30min).time_since_epoch()};
+      (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h).time_since_epoch()};
 
   std::chrono::sys_seconds expected{time.time_since_epoch() - 1h};
 
@@ -100,6 +100,13 @@ static void test_nonexistent() {
   assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == expected);
   assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == expected);
   assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == expected);
+
+  // The entire nonexisting hour should map to the same time.
+  // For nonexistant the value of std::chrono::choose has no effect.
+  assert(tz->to_sys(time + 1s, std::chrono::choose::earliest) == expected);
+  assert(tz->to_sys(time + 1min, std::chrono::choose::latest) == expected);
+  assert(tz->to_sys(time + 30min, std::chrono::choose::earliest) == expected);
+  assert(tz->to_sys(time + 59min + 59s, std::chrono::choose::latest) == expected);
 }
 
 // Tests ambiguous conversions.
@@ -120,7 +127,7 @@ static void test_ambiguous() {
   // Pick an historic date where it's well known what the time zone rules were.
   // This makes it unlikely updates to the database change these rules.
   std::chrono::local_time<std::chrono::seconds> time{
-      (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h + 30min).time_since_epoch()};
+      (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h).time_since_epoch()};
 
   std::chrono::sys_seconds earlier{time.time_since_epoch() - 2h};
   std::chrono::sys_seconds later{time.time_since_epoch() - 1h};
@@ -133,6 +140,12 @@ static void test_ambiguous() {
   assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == later);
   assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == earlier);
   assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == later);
+
+  // Test times in the ambigious hour
+  assert(tz->to_sys(time + 1s, std::chrono::choose::earliest) == earlier + 1s);
+  assert(tz->to_sys(time + 1min, std::chrono::choose::latest) == later + 1min);
+  assert(tz->to_sys(time + 30min, std::chrono::choose::earliest) == earlier + 30min);
+  assert(tz->to_sys(time + 59min + 59s, std::chrono::choose::latest) == later + 59min + 59s);
 }
 
 // This test does the basic validations of this function. The library function

From 6ea0367372362e50ddaa6ec51a502c1ca19d26d7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 19 Feb 2025 19:16:30 -0800
Subject: [PATCH 170/282] workflows/release-binaries: Disable Flang on x86_64
 macOS (#127216)

The flang build was taking 2-3 hours and causing the entire job to
timeout, so we need to disable it.

(cherry picked from commit 3e5ae5777d92b6f8c647c3f6969fbca0f0f769ff)
---
 .github/workflows/release-binaries.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 204ee6405382f..fa3dfa5b7d313 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -138,6 +138,11 @@ jobs:
             arches=arm64
           else
             arches=x86_64
+            # Disable Flang builds on macOS x86_64.  The FortranLower library takes
+            # 2-3 hours to build on macOS, much slower than on Linux.
+            # The long build time causes the release build to time out on x86_64,
+            # so we need to disable flang there.
+            target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;bolt;polly;mlir'"
           fi
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
         fi

From 05cce88ab900af4a4e08018e96f55b0c308ab973 Mon Sep 17 00:00:00 2001
From: "A. Jiang" <de34@live.cn>
Date: Wed, 19 Feb 2025 09:06:51 +0800
Subject: [PATCH 171/282] [libc++] Set feature-test macro
 `__cpp_lib_atomic_float` (#127559)

The corresponding feature was implemented in LLVM 18 (by #67799), but
this FTM wasn't added before.

(cherry picked from commit 2207e3e32549306bf563c6987f790cabe8d4ea78)
---
 libcxx/docs/FeatureTestMacroTable.rst         |  2 +-
 libcxx/docs/Status/Cxx20Papers.csv            |  2 +-
 libcxx/include/version                        |  2 +-
 .../atomic.version.compile.pass.cpp           | 48 ++++++-------------
 .../version.version.compile.pass.cpp          | 48 ++++++-------------
 .../generate_feature_test_macro_components.py |  1 -
 6 files changed, 33 insertions(+), 70 deletions(-)

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index ccaa784ccb088..dcf9838edd74b 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -174,7 +174,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_atomic_flag_test``                             ``201907L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_atomic_float``                                 *unimplemented*
+    ``__cpp_lib_atomic_float``                                 ``201711L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_atomic_lock_free_type_aliases``                ``201907L``
     ---------------------------------------------------------- -----------------
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index 524c6d0ac8be0..b595da3728841 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -2,7 +2,7 @@
 "`P0463R1 <https://wg21.link/P0463R1>`__","Endian just Endian","2017-07 (Toronto)","|Complete|","7",""
 "`P0674R1 <https://wg21.link/P0674R1>`__","Extending make_shared to Support Arrays","2017-07 (Toronto)","|Complete|","15",""
 "","","","","",""
-"`P0020R6 <https://wg21.link/P0020R6>`__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18",""
+"`P0020R6 <https://wg21.link/P0020R6>`__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","The feature-test macro was not set until LLVM 20."
 "`P0053R7 <https://wg21.link/P0053R7>`__","C++ Synchronized Buffered Ostream","2017-11 (Albuquerque)","|Complete|","18",""
 "`P0202R3 <https://wg21.link/P0202R3>`__","Add constexpr modifiers to functions in <algorithm> and <utility> Headers","2017-11 (Albuquerque)","|Complete|","12",""
 "`P0415R1 <https://wg21.link/P0415R1>`__","Constexpr for ``std::complex``\ ","2017-11 (Albuquerque)","|Complete|","16",""
diff --git a/libcxx/include/version b/libcxx/include/version
index c5966b90c061d..63ead9fd5d29d 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -378,7 +378,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_array_constexpr                      201811L
 # define __cpp_lib_assume_aligned                       201811L
 # define __cpp_lib_atomic_flag_test                     201907L
-// # define __cpp_lib_atomic_float                         201711L
+# define __cpp_lib_atomic_float                         201711L
 # define __cpp_lib_atomic_lock_free_type_aliases        201907L
 # define __cpp_lib_atomic_ref                           201806L
 // # define __cpp_lib_atomic_shared_ptr                    201711L
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp
index 9ed18fbfe19ac..5a21e6320bffe 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp
@@ -169,17 +169,11 @@
 #   error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should be defined in c++20"
-#   endif
-#   if __cpp_lib_atomic_float != 201711L
-#     error "__cpp_lib_atomic_float should have the value 201711L in c++20"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_atomic_float
+#   error "__cpp_lib_atomic_float should be defined in c++20"
+# endif
+# if __cpp_lib_atomic_float != 201711L
+#   error "__cpp_lib_atomic_float should have the value 201711L in c++20"
 # endif
 
 # ifndef __cpp_lib_atomic_is_always_lock_free
@@ -262,17 +256,11 @@
 #   error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should be defined in c++23"
-#   endif
-#   if __cpp_lib_atomic_float != 201711L
-#     error "__cpp_lib_atomic_float should have the value 201711L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_atomic_float
+#   error "__cpp_lib_atomic_float should be defined in c++23"
+# endif
+# if __cpp_lib_atomic_float != 201711L
+#   error "__cpp_lib_atomic_float should have the value 201711L in c++23"
 # endif
 
 # ifndef __cpp_lib_atomic_is_always_lock_free
@@ -355,17 +343,11 @@
 #   error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should be defined in c++26"
-#   endif
-#   if __cpp_lib_atomic_float != 201711L
-#     error "__cpp_lib_atomic_float should have the value 201711L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_atomic_float
+#   error "__cpp_lib_atomic_float should be defined in c++26"
+# endif
+# if __cpp_lib_atomic_float != 201711L
+#   error "__cpp_lib_atomic_float should have the value 201711L in c++26"
 # endif
 
 # ifndef __cpp_lib_atomic_is_always_lock_free
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 137d6cb428930..1e4465d515e6b 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -3282,17 +3282,11 @@
 #   error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should be defined in c++20"
-#   endif
-#   if __cpp_lib_atomic_float != 201711L
-#     error "__cpp_lib_atomic_float should have the value 201711L in c++20"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_atomic_float
+#   error "__cpp_lib_atomic_float should be defined in c++20"
+# endif
+# if __cpp_lib_atomic_float != 201711L
+#   error "__cpp_lib_atomic_float should have the value 201711L in c++20"
 # endif
 
 # ifndef __cpp_lib_atomic_is_always_lock_free
@@ -4707,17 +4701,11 @@
 #   error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should be defined in c++23"
-#   endif
-#   if __cpp_lib_atomic_float != 201711L
-#     error "__cpp_lib_atomic_float should have the value 201711L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_atomic_float
+#   error "__cpp_lib_atomic_float should be defined in c++23"
+# endif
+# if __cpp_lib_atomic_float != 201711L
+#   error "__cpp_lib_atomic_float should have the value 201711L in c++23"
 # endif
 
 # ifndef __cpp_lib_atomic_is_always_lock_free
@@ -6369,17 +6357,11 @@
 #   error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should be defined in c++26"
-#   endif
-#   if __cpp_lib_atomic_float != 201711L
-#     error "__cpp_lib_atomic_float should have the value 201711L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_atomic_float
-#     error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_atomic_float
+#   error "__cpp_lib_atomic_float should be defined in c++26"
+# endif
+# if __cpp_lib_atomic_float != 201711L
+#   error "__cpp_lib_atomic_float should have the value 201711L in c++26"
 # endif
 
 # ifndef __cpp_lib_atomic_is_always_lock_free
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 00318c2d2a3cd..a26bab8790f6a 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -162,7 +162,6 @@ def add_version_header(tc):
             "name": "__cpp_lib_atomic_float",
             "values": {"c++20": 201711},
             "headers": ["atomic"],
-            "unimplemented": True,
         },
         {
             "name": "__cpp_lib_atomic_is_always_lock_free",

From 6dcece41472d64555f0ca2539a3e1e1e5feec083 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 20 Feb 2025 12:22:11 +0000
Subject: [PATCH 172/282] [GlobalISel][AArch64] Fix fptoi.sat lowering.
 (#127901)

The SDAG version uses fminnum/fmaxnum, in converting it to fcmp+select
it appears the order of the operands was chosen badly. This switches the
conditions used to keep the constant on the RHS.

(cherry picked from commit 70ed381b1693697dec3efcaed161d3626d16cff1)
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   4 +-
 .../test/CodeGen/AArch64/fptosi-sat-scalar.ll |  12 +-
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 144 +++++++++---------
 .../test/CodeGen/AArch64/fptoui-sat-scalar.ll |  12 +-
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 144 +++++++++---------
 5 files changed, 158 insertions(+), 158 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d0a62340a5f32..536c193d52080 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7801,13 +7801,13 @@ LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
   if (AreExactFloatBounds) {
     // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
     auto MaxC = MIRBuilder.buildFConstant(SrcTy, MinFloat);
-    auto MaxP = MIRBuilder.buildFCmp(CmpInst::FCMP_ULT,
+    auto MaxP = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT,
                                      SrcTy.changeElementSize(1), Src, MaxC);
     auto Max = MIRBuilder.buildSelect(SrcTy, MaxP, Src, MaxC);
     // Clamp by MaxFloat from above. NaN cannot occur.
     auto MinC = MIRBuilder.buildFConstant(SrcTy, MaxFloat);
     auto MinP =
-        MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, SrcTy.changeElementSize(1), Max,
+        MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, SrcTy.changeElementSize(1), Max,
                              MinC, MachineInstr::FmNoNans);
     auto Min =
         MIRBuilder.buildSelect(SrcTy, MinP, Max, MinC, MachineInstr::FmNoNans);
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
index bfb5c67801e6c..39e2db3a52d2c 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
@@ -987,25 +987,25 @@ define i32 @test_signed_f128_i32(fp128 %f) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI30_1
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI30_1]
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x9, #-4603241769126068224 // =0xc01e000000000000
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, x9, lt
+; CHECK-GI-NEXT:    csel x20, x8, x9, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI30_0]
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mov x8, #281474976448512 // =0xfffffffc0000
 ; CHECK-GI-NEXT:    movk x8, #16413, lsl #48
-; CHECK-GI-NEXT:    csel x8, x20, x8, gt
+; CHECK-GI-NEXT:    csel x8, x20, x8, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index b2b3430f4d85e..67d625dd16473 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -535,25 +535,25 @@ define <1 x i32> @test_signed_v1f128_v1i32(<1 x fp128> %f) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI14_1
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_1]
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x9, #-4603241769126068224 // =0xc01e000000000000
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, x9, lt
+; CHECK-GI-NEXT:    csel x20, x8, x9, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mov x8, #281474976448512 // =0xfffffffc0000
 ; CHECK-GI-NEXT:    movk x8, #16413, lsl #48
-; CHECK-GI-NEXT:    csel x8, x20, x8, gt
+; CHECK-GI-NEXT:    csel x8, x20, x8, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -656,26 +656,26 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    mov v1.16b, v2.16b
 ; CHECK-GI-NEXT:    str q2, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x20, #-4603241769126068224 // =0xc01e000000000000
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x21, x8, x20, lt
+; CHECK-GI-NEXT:    csel x21, x8, x20, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x21
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
 ; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x22, #281474976448512 // =0xfffffffc0000
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    movk x22, #16413, lsl #48
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x21, x22, gt
+; CHECK-GI-NEXT:    csel x8, x21, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -686,21 +686,21 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w21, wzr, w19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, x20, lt
+; CHECK-GI-NEXT:    csel x20, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x22, gt
+; CHECK-GI-NEXT:    csel x8, x20, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -827,26 +827,26 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    str q2, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x20, #-4603241769126068224 // =0xc01e000000000000
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x21, x8, x20, lt
+; CHECK-GI-NEXT:    csel x21, x8, x20, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x21
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
 ; CHECK-GI-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x22, #281474976448512 // =0xfffffffc0000
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    movk x22, #16413, lsl #48
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x21, x22, gt
+; CHECK-GI-NEXT:    csel x8, x21, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -856,21 +856,21 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w21, wzr, w19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x23, x8, x20, lt
+; CHECK-GI-NEXT:    csel x23, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x23
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x23, x22, gt
+; CHECK-GI-NEXT:    csel x8, x23, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -881,21 +881,21 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w23, wzr, w19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, x20, lt
+; CHECK-GI-NEXT:    csel x20, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x22, gt
+; CHECK-GI-NEXT:    csel x8, x20, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
@@ -1043,26 +1043,26 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI17_1]
 ; CHECK-GI-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    stp q1, q3, [sp, #64] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x20, #-4603241769126068224 // =0xc01e000000000000
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x21, x8, x20, lt
+; CHECK-GI-NEXT:    csel x21, x8, x20, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x21
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
 ; CHECK-GI-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x22, #281474976448512 // =0xfffffffc0000
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    movk x22, #16413, lsl #48
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x21, x22, gt
+; CHECK-GI-NEXT:    csel x8, x21, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -1073,21 +1073,21 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w21, wzr, w19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x23, x8, x20, lt
+; CHECK-GI-NEXT:    csel x23, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x23
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x23, x22, gt
+; CHECK-GI-NEXT:    csel x8, x23, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -1098,20 +1098,20 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w23, wzr, w19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldp q0, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x24, x8, x20, lt
+; CHECK-GI-NEXT:    csel x24, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x24
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x24, x22, gt
+; CHECK-GI-NEXT:    csel x8, x24, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -1121,21 +1121,21 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldp q1, q0, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w24, wzr, w19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, x20, lt
+; CHECK-GI-NEXT:    csel x20, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x22, gt
+; CHECK-GI-NEXT:    csel x8, x20, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #80] // 16-byte Folded Reload
@@ -5633,26 +5633,26 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    mov v1.16b, v2.16b
 ; CHECK-GI-NEXT:    str q2, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x20, #-4594234569871327232 // =0xc03e000000000000
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x21, x8, x20, lt
+; CHECK-GI-NEXT:    csel x21, x8, x20, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI86_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x21
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI86_0]
 ; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    mov x22, #-1125899906842624 // =0xfffc000000000000
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x23, #4629137466983448575 // =0x403dffffffffffff
-; CHECK-GI-NEXT:    csel x8, x19, x22, gt
+; CHECK-GI-NEXT:    csel x8, x19, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x21, x23, gt
+; CHECK-GI-NEXT:    csel x8, x21, x23, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfdi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -5663,21 +5663,21 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel x21, xzr, x19, ne
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, x20, lt
+; CHECK-GI-NEXT:    csel x20, x8, x20, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, x22, gt
+; CHECK-GI-NEXT:    csel x8, x19, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x23, gt
+; CHECK-GI-NEXT:    csel x8, x20, x23, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixtfdi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
index 0dea7be5052d0..46950e7a60349 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
@@ -797,25 +797,25 @@ define i32 @test_unsigned_f128_i32(fp128 %f) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI30_1
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI30_1]
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI30_0]
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mov x8, #281474976579584 // =0xfffffffe0000
 ; CHECK-GI-NEXT:    movk x8, #16414, lsl #48
-; CHECK-GI-NEXT:    csel x8, x20, x8, gt
+; CHECK-GI-NEXT:    csel x8, x20, x8, lt
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index b76df6a101e5f..4d3486d4a2993 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -491,24 +491,24 @@ define <1 x i32> @test_unsigned_v1f128_v1i32(<1 x fp128> %f) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI14_1
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_1]
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mov x8, #281474976579584 // =0xfffffffe0000
 ; CHECK-GI-NEXT:    movk x8, #16414, lsl #48
-; CHECK-GI-NEXT:    csel x8, x20, x8, gt
+; CHECK-GI-NEXT:    csel x8, x20, x8, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
@@ -588,44 +588,44 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI15_1]
 ; CHECK-GI-NEXT:    stp q2, q1, [sp, #16] // 32-byte Folded Spill
 ; CHECK-GI-NEXT:    mov v1.16b, v2.16b
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
 ; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x21, #281474976579584 // =0xfffffffe0000
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    movk x21, #16414, lsl #48
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x21, gt
+; CHECK-GI-NEXT:    csel x8, x20, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldp q1, q0, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x20
-; CHECK-GI-NEXT:    csel x22, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x22, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x22
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x20, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x20, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x22, x21, gt
+; CHECK-GI-NEXT:    csel x8, x22, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    mov v0.s[0], w19
@@ -722,63 +722,63 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI16_1]
 ; CHECK-GI-NEXT:    stp q1, q2, [sp, #32] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
 ; CHECK-GI-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x21, #281474976579584 // =0xfffffffe0000
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    movk x21, #16414, lsl #48
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x21, gt
+; CHECK-GI-NEXT:    csel x8, x20, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x20
-; CHECK-GI-NEXT:    csel x22, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x22, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x22
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x20, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x20, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x22, x21, gt
+; CHECK-GI-NEXT:    csel x8, x22, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x22, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x22, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x22
-; CHECK-GI-NEXT:    csel x23, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x23, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x23
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x22, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x22, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x23, x21, gt
+; CHECK-GI-NEXT:    csel x8, x23, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    mov v0.s[0], w19
@@ -895,84 +895,84 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
 ; CHECK-GI-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x22, #281474976579584 // =0xfffffffe0000
-; CHECK-GI-NEXT:    csel x8, x19, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lt
 ; CHECK-GI-NEXT:    movk x22, #16414, lsl #48
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x22, gt
+; CHECK-GI-NEXT:    csel x8, x20, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x20
-; CHECK-GI-NEXT:    csel x21, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x21, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x21
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x20, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x20, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x21, x22, gt
+; CHECK-GI-NEXT:    csel x8, x21, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x21, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x21, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x21
-; CHECK-GI-NEXT:    csel x23, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x23, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x23
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x21, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x21, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x23, x22, gt
+; CHECK-GI-NEXT:    csel x8, x23, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w21, w0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldp q0, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x23, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x23, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x23
-; CHECK-GI-NEXT:    csel x24, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x24, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x24
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x23, xzr, gt
+; CHECK-GI-NEXT:    csel x8, x23, xzr, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x24, x22, gt
+; CHECK-GI-NEXT:    csel x8, x24, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    mov v0.s[0], w19
@@ -4614,44 +4614,44 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI86_1]
 ; CHECK-GI-NEXT:    stp q2, q1, [sp, #16] // 32-byte Folded Spill
 ; CHECK-GI-NEXT:    mov v1.16b, v2.16b
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x19, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x19, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x19
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    adrp x8, .LCPI86_0
 ; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI86_0]
 ; CHECK-GI-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    mov x21, #-562949953421312 // =0xfffe000000000000
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    mov x22, #4629418941960159231 // =0x403effffffffffff
-; CHECK-GI-NEXT:    csel x8, x19, x21, gt
+; CHECK-GI-NEXT:    csel x8, x19, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x20, x22, gt
+; CHECK-GI-NEXT:    csel x8, x20, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfdi
 ; CHECK-GI-NEXT:    ldp q1, q0, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x19, x0
-; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    bl __gttf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    csel x20, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x20, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov x8, v0.d[1]
 ; CHECK-GI-NEXT:    mov v0.d[0], x20
-; CHECK-GI-NEXT:    csel x23, x8, xzr, lt
+; CHECK-GI-NEXT:    csel x23, x8, xzr, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x23
-; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    csel x8, x20, x21, gt
+; CHECK-GI-NEXT:    csel x8, x20, x21, lt
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    csel x8, x23, x22, gt
+; CHECK-GI-NEXT:    csel x8, x23, x22, lt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfdi
 ; CHECK-GI-NEXT:    mov v0.d[0], x19

From 3007684f86468c344c5d0b77217b40b33173cb02 Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Fri, 21 Feb 2025 07:08:46 +0800
Subject: [PATCH 173/282] release/20.x: [Clang] Remove the PackExpansion
 restrictions for rewrite substitution (#127174)

This backports c08b80eb525a6e6a34d74634bf5181f11ed12984 with a release
note towards 20 so that we could resolve some pains in CTAD.
---
 clang/docs/ReleaseNotes.rst                |  2 +
 clang/lib/Sema/SemaTemplate.cpp            |  2 +-
 clang/lib/Sema/SemaTemplateInstantiate.cpp | 32 +++++++--------
 clang/test/AST/ast-dump-ctad-alias.cpp     | 46 ++++++++++++++++++++++
 4 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ad1a5e7ae282e..03c420bcfd932 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1053,6 +1053,8 @@ Bug Fixes to C++ Support
   template parameter. Now, such expression can be used with ``static_assert`` and ``constexpr``. (#GH123498)
 - Correctly determine the implicit constexprness of lambdas in dependent contexts. (#GH97958) (#GH114234)
 - Fix that some dependent immediate expressions did not cause immediate escalation (#GH119046)
+- Fixed a substitution bug in transforming CTAD aliases when the type alias contains a non-pack template argument
+  corresponding to a pack parameter (#GH124715)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 3944c4f67bab9..f4045debf4521 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -4905,7 +4905,7 @@ bool Sema::CheckTemplateTypeArgument(
     [[fallthrough]];
   }
   default: {
-    // We allow instantiateing a template with template argument packs when
+    // We allow instantiating a template with template argument packs when
     // building deduction guides.
     if (Arg.getKind() == TemplateArgument::Pack &&
         CodeSynthesisContexts.back().Kind ==
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index c45d3ffe2508b..eec56b7493bad 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1467,6 +1467,18 @@ namespace {
       }
     }
 
+    static TemplateArgument
+    getTemplateArgumentPackPatternForRewrite(const TemplateArgument &TA) {
+      if (TA.getKind() != TemplateArgument::Pack)
+        return TA;
+      assert(TA.pack_size() == 1 &&
+             "unexpected pack arguments in template rewrite");
+      TemplateArgument Arg = *TA.pack_begin();
+      if (Arg.isPackExpansion())
+        Arg = Arg.getPackExpansionPattern();
+      return Arg;
+    }
+
     /// Transform the given declaration by instantiating a reference to
     /// this declaration.
     Decl *TransformDecl(SourceLocation Loc, Decl *D);
@@ -1624,7 +1636,7 @@ namespace {
           TemplateArgumentLoc Input = SemaRef.getTrivialTemplateArgumentLoc(
               pack, QualType(), SourceLocation{});
           TemplateArgumentLoc Output;
-          if (SemaRef.SubstTemplateArgument(Input, TemplateArgs, Output))
+          if (TransformTemplateArgument(Input, Output, Uneval))
             return true; // fails
           TArgs.push_back(Output.getArgument());
         }
@@ -2036,11 +2048,7 @@ TemplateName TemplateInstantiator::TransformTemplateName(
       if (TemplateArgs.isRewrite()) {
         // We're rewriting the template parameter as a reference to another
         // template parameter.
-        if (Arg.getKind() == TemplateArgument::Pack) {
-          assert(Arg.pack_size() == 1 && Arg.pack_begin()->isPackExpansion() &&
-                 "unexpected pack arguments in template rewrite");
-          Arg = Arg.pack_begin()->getPackExpansionPattern();
-        }
+        Arg = getTemplateArgumentPackPatternForRewrite(Arg);
         assert(Arg.getKind() == TemplateArgument::Template &&
                "unexpected nontype template argument kind in template rewrite");
         return Arg.getAsTemplate();
@@ -2121,11 +2129,7 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E,
   if (TemplateArgs.isRewrite()) {
     // We're rewriting the template parameter as a reference to another
     // template parameter.
-    if (Arg.getKind() == TemplateArgument::Pack) {
-      assert(Arg.pack_size() == 1 && Arg.pack_begin()->isPackExpansion() &&
-             "unexpected pack arguments in template rewrite");
-      Arg = Arg.pack_begin()->getPackExpansionPattern();
-    }
+    Arg = getTemplateArgumentPackPatternForRewrite(Arg);
     assert(Arg.getKind() == TemplateArgument::Expression &&
            "unexpected nontype template argument kind in template rewrite");
     // FIXME: This can lead to the same subexpression appearing multiple times
@@ -2578,11 +2582,7 @@ TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB,
     if (TemplateArgs.isRewrite()) {
       // We're rewriting the template parameter as a reference to another
       // template parameter.
-      if (Arg.getKind() == TemplateArgument::Pack) {
-        assert(Arg.pack_size() == 1 && Arg.pack_begin()->isPackExpansion() &&
-               "unexpected pack arguments in template rewrite");
-        Arg = Arg.pack_begin()->getPackExpansionPattern();
-      }
+      Arg = getTemplateArgumentPackPatternForRewrite(Arg);
       assert(Arg.getKind() == TemplateArgument::Type &&
              "unexpected nontype template argument kind in template rewrite");
       QualType NewT = Arg.getAsType();
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index b1631f7822ce0..f39a4cee518ce 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -156,3 +156,49 @@ ATemplatedClass2 test2(list);
 // CHECK-NEXT: |-TypeTraitExpr {{.*}} 'bool' __is_deducible
 
 } // namespace GH90209
+
+namespace GH124715 {
+
+template <class T, class... Args>
+concept invocable = true;
+
+template <class T, class... Args> struct Struct {
+  template <class U>
+    requires invocable<U, Args...>
+  Struct(U, Args...) {}
+};
+
+template <class...> struct Packs {};
+
+template <class Lambda, class... Args>
+Struct(Lambda lambda, Args... args) -> Struct<Lambda, Args...>;
+
+template <class T, class... Ts> using Alias = Struct<T, Packs<Ts...>>;
+
+void foo() {
+  Alias([](int) {}, Packs<int>());
+}
+
+// CHECK:      |-FunctionTemplateDecl {{.*}} implicit <deduction guide for Alias>
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} class depth 0 index 0 T
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} class depth 0 index 1 ... Ts
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} class depth 0 index 2 U
+// CHECK-NEXT: | |-BinaryOperator {{.*}} 'bool' '&&'
+// CHECK-NEXT: | | |-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'invocable'
+// CHECK-NEXT: | | | |-ImplicitConceptSpecializationDecl {{.*}}
+// CHECK-NEXT: | | | | |-TemplateArgument type 'type-parameter-0-2'
+// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
+// CHECK-NEXT: | | | | `-TemplateArgument pack '<Packs<type-parameter-0-1...>>'
+// CHECK-NEXT: | | | |   `-TemplateArgument type 'Packs<type-parameter-0-1...>'
+// CHECK-NEXT: | | | |     `-TemplateSpecializationType {{.*}} 'Packs<type-parameter-0-1...>' dependent
+// CHECK-NEXT: | | | |       |-name: 'GH124715::Packs'
+// CHECK-NEXT: | | | |       | `-ClassTemplateDecl {{.*}} Packs
+// CHECK-NEXT: | | | |       `-TemplateArgument pack '<type-parameter-0-1...>'
+// CHECK-NEXT: | | | |         `-TemplateArgument type 'type-parameter-0-1...'
+// CHECK-NEXT: | | | |           `-PackExpansionType {{.*}} 'type-parameter-0-1...' dependent
+// CHECK-NEXT: | | | |             `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent contains_unexpanded_pack depth 0 index 1 pack
+// CHECK-NEXT: | | | |-TemplateArgument {{.*}} type 'U':'type-parameter-0-2'
+// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2
+// CHECK-NEXT: | | | |   `-TemplateTypeParm {{.*}} 'U'
+
+} // namespace GH124715

From 99947c59de7b8ecbdda2a8b8ce78abc3083adee0 Mon Sep 17 00:00:00 2001
From: Zixu Wang <9819235+zixu-w@users.noreply.github.com>
Date: Thu, 13 Feb 2025 16:12:22 -0800
Subject: [PATCH 174/282] =?UTF-8?q?Revert=20"[C++20][Modules][Serializatio?=
 =?UTF-8?q?n]=20Delay=20marking=20pending=20incompl=E2=80=A6=20(#127136)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ete decl chains until the end of `finishPendingActions`. (#121245)"

This reverts commit a9e249f64e800fbb20a3b26c0cfb68c1a1aee5e1.

Reverting this change because of issue #126973.

(cherry picked from commit 912b154f3a3f8c3cebf5cc5731fd8b0749762da5)
---
 clang/lib/Serialization/ASTReader.cpp | 25 +++----
 clang/test/Modules/pr121245.cpp       | 93 ---------------------------
 2 files changed, 13 insertions(+), 105 deletions(-)
 delete mode 100644 clang/test/Modules/pr121245.cpp

diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 24acd6e297e71..f524251c48ddd 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -10186,12 +10186,12 @@ void ASTReader::visitTopLevelModuleMaps(
 }
 
 void ASTReader::finishPendingActions() {
-  while (!PendingIdentifierInfos.empty() ||
-         !PendingDeducedFunctionTypes.empty() ||
-         !PendingDeducedVarTypes.empty() || !PendingDeclChains.empty() ||
-         !PendingMacroIDs.empty() || !PendingDeclContextInfos.empty() ||
-         !PendingUpdateRecords.empty() ||
-         !PendingObjCExtensionIvarRedeclarations.empty()) {
+  while (
+      !PendingIdentifierInfos.empty() || !PendingDeducedFunctionTypes.empty() ||
+      !PendingDeducedVarTypes.empty() || !PendingIncompleteDeclChains.empty() ||
+      !PendingDeclChains.empty() || !PendingMacroIDs.empty() ||
+      !PendingDeclContextInfos.empty() || !PendingUpdateRecords.empty() ||
+      !PendingObjCExtensionIvarRedeclarations.empty()) {
     // If any identifiers with corresponding top-level declarations have
     // been loaded, load those declarations now.
     using TopLevelDeclsMap =
@@ -10239,6 +10239,13 @@ void ASTReader::finishPendingActions() {
     }
     PendingDeducedVarTypes.clear();
 
+    // For each decl chain that we wanted to complete while deserializing, mark
+    // it as "still needs to be completed".
+    for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) {
+      markIncompleteDeclChain(PendingIncompleteDeclChains[I]);
+    }
+    PendingIncompleteDeclChains.clear();
+
     // Load pending declaration chains.
     for (unsigned I = 0; I != PendingDeclChains.size(); ++I)
       loadPendingDeclChain(PendingDeclChains[I].first,
@@ -10476,12 +10483,6 @@ void ASTReader::finishPendingActions() {
   for (auto *ND : PendingMergedDefinitionsToDeduplicate)
     getContext().deduplicateMergedDefinitonsFor(ND);
   PendingMergedDefinitionsToDeduplicate.clear();
-
-  // For each decl chain that we wanted to complete while deserializing, mark
-  // it as "still needs to be completed".
-  for (Decl *D : PendingIncompleteDeclChains)
-    markIncompleteDeclChain(D);
-  PendingIncompleteDeclChains.clear();
 }
 
 void ASTReader::diagnoseOdrViolations() {
diff --git a/clang/test/Modules/pr121245.cpp b/clang/test/Modules/pr121245.cpp
deleted file mode 100644
index 0e276ad0e435d..0000000000000
--- a/clang/test/Modules/pr121245.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-// If this test fails, it should be investigated under Debug builds.
-// Before the PR, this test was encountering an `llvm_unreachable()`.
-
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-// RUN: cd %t
-
-// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-01.h \
-// RUN:  -fcxx-exceptions -o %t/hu-01.pcm
-
-// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-02.h \
-// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
-// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-02.pcm
-
-// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-03.h \
-// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
-// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-03.pcm
-
-// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-04.h \
-// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
-// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-04.pcm
-
-// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-05.h \
-// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
-// RUN:  -fmodule-file=%t/hu-03.pcm -fmodule-file=%t/hu-04.pcm \
-// RUN:  -fmodule-file=%t/hu-01.pcm -o %t/hu-05.pcm
-
-// RUN: %clang_cc1 -std=c++20 -emit-obj %t/main.cpp \
-// RUN:  -Wno-experimental-header-units -fcxx-exceptions \
-// RUN:  -fmodule-file=%t/hu-02.pcm -fmodule-file=%t/hu-05.pcm \
-// RUN:  -fmodule-file=%t/hu-04.pcm -fmodule-file=%t/hu-03.pcm \
-// RUN:  -fmodule-file=%t/hu-01.pcm
-
-//--- hu-01.h
-template <typename T>
-struct A {
-  A() {}
-  ~A() {}
-};
-
-template <typename T>
-struct EBO : T {
-  EBO() = default;
-};
-
-template <typename T>
-struct HT : EBO<A<T>> {};
-
-//--- hu-02.h
-import "hu-01.h";
-
-inline void f() {
-  HT<int>();
-}
-
-//--- hu-03.h
-import "hu-01.h";
-
-struct C {
-  C();
-
-  HT<long> _;
-};
-
-//--- hu-04.h
-import "hu-01.h";
-
-void g(HT<long> = {});
-
-//--- hu-05.h
-import "hu-03.h";
-import "hu-04.h";
-import "hu-01.h";
-
-struct B {
-  virtual ~B() = default;
-
-  virtual void f() {
-    HT<long>();
-  }
-};
-
-//--- main.cpp
-import "hu-02.h";
-import "hu-05.h";
-import "hu-03.h";
-
-int main() {
-  f();
-  C();
-  B();
-}

From 9189ca87225272ff5cd36f40411924b88c6985e3 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Wed, 5 Feb 2025 12:02:24 -0300
Subject: [PATCH 175/282] Backport: [clang] fix P3310 overload resolution flag
 propagation (#125791)

Class templates might be only instantiated when they are required to be
complete, but checking the template args against the primary template is
immediate.

This result is cached so that later when the class is instantiated,
checking against the primary template is not repeated.

The 'MatchedPackOnParmToNonPackOnArg' flag is also produced upon
checking against the primary template, so it needs to be cached in the
specialziation as well.

This fixes a bug which has not been in any release, so there are no
release notes.

Fixes #125290
---
 clang/include/clang/AST/DeclTemplate.h        |   16 +-
 clang/include/clang/Sema/Sema.h               |    4 +-
 clang/lib/AST/ASTImporter.cpp                 |    6 +-
 clang/lib/AST/DeclTemplate.cpp                |   47 +-
 clang/lib/AST/JSONNodeDumper.cpp              |    5 +
 clang/lib/AST/TextNodeDumper.cpp              |    5 +-
 clang/lib/Sema/SemaTemplate.cpp               |    8 +-
 clang/lib/Sema/SemaTemplateDeduction.cpp      |    2 -
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |    2 +-
 clang/lib/Sema/SemaType.cpp                   |    3 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |    1 +
 clang/lib/Serialization/ASTWriterDecl.cpp     |    1 +
 clang/test/AST/ast-dump-templates.cpp         | 6047 ++++++++++++++++-
 clang/test/AST/gen_ast_dump_json_test.py      |   21 +-
 clang/test/SemaTemplate/cwg2398.cpp           |   20 +
 .../Clang/CxxModuleHandler.cpp                |    3 +-
 16 files changed, 6146 insertions(+), 45 deletions(-)

diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 9ecff2c898acd..03c43765206b1 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -1841,15 +1841,23 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl,
   LLVM_PREFERRED_TYPE(TemplateSpecializationKind)
   unsigned SpecializationKind : 3;
 
+  /// Indicate that we have matched a parameter pack with a non pack
+  /// argument, when the opposite match is also allowed (strict pack match).
+  /// This needs to be cached as deduction is performed during declaration,
+  /// and we need the information to be preserved so that it is consistent
+  /// during instantiation.
+  bool MatchedPackOnParmToNonPackOnArg : 1;
+
 protected:
   ClassTemplateSpecializationDecl(ASTContext &Context, Kind DK, TagKind TK,
                                   DeclContext *DC, SourceLocation StartLoc,
                                   SourceLocation IdLoc,
                                   ClassTemplateDecl *SpecializedTemplate,
                                   ArrayRef<TemplateArgument> Args,
+                                  bool MatchedPackOnParmToNonPackOnArg,
                                   ClassTemplateSpecializationDecl *PrevDecl);
 
-  explicit ClassTemplateSpecializationDecl(ASTContext &C, Kind DK);
+  ClassTemplateSpecializationDecl(ASTContext &C, Kind DK);
 
 public:
   friend class ASTDeclReader;
@@ -1859,7 +1867,7 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl,
   Create(ASTContext &Context, TagKind TK, DeclContext *DC,
          SourceLocation StartLoc, SourceLocation IdLoc,
          ClassTemplateDecl *SpecializedTemplate,
-         ArrayRef<TemplateArgument> Args,
+         ArrayRef<TemplateArgument> Args, bool MatchedPackOnParmToNonPackOnArg,
          ClassTemplateSpecializationDecl *PrevDecl);
   static ClassTemplateSpecializationDecl *CreateDeserialized(ASTContext &C,
                                                              GlobalDeclID ID);
@@ -1930,6 +1938,10 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl,
     SpecializationKind = TSK;
   }
 
+  bool hasMatchedPackOnParmToNonPackOnArg() const {
+    return MatchedPackOnParmToNonPackOnArg;
+  }
+
   /// Get the point of instantiation (if any), or null if none.
   SourceLocation getPointOfInstantiation() const {
     return PointOfInstantiation;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index eb82d1b978e94..a30a7076ea5d4 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13491,8 +13491,8 @@ class Sema final : public SemaBase {
   bool InstantiateClassTemplateSpecialization(
       SourceLocation PointOfInstantiation,
       ClassTemplateSpecializationDecl *ClassTemplateSpec,
-      TemplateSpecializationKind TSK, bool Complain = true,
-      bool PrimaryHasMatchedPackOnParmToNonPackOnArg = false);
+      TemplateSpecializationKind TSK, bool Complain,
+      bool PrimaryHasMatchedPackOnParmToNonPackOnArg);
 
   /// Instantiates the definitions of all of the member
   /// of the given class, which is an instantiation of a class template
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 09fa10f716ec1..13e7f93233a7f 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -6321,9 +6321,9 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateSpecializationDecl(
     updateLookupTableForTemplateParameters(*ToTPList);
   } else { // Not a partial specialization.
     if (GetImportedOrCreateDecl(
-            D2, D, Importer.getToContext(), D->getTagKind(), DC,
-            *BeginLocOrErr, *IdLocOrErr, ClassTemplate, TemplateArgs,
-            PrevDecl))
+            D2, D, Importer.getToContext(), D->getTagKind(), DC, *BeginLocOrErr,
+            *IdLocOrErr, ClassTemplate, TemplateArgs,
+            D->hasMatchedPackOnParmToNonPackOnArg(), PrevDecl))
       return D2;
 
     // Update InsertPos, because preceding import calls may have invalidated
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 2e1ed9e10713a..fe8734d262a96 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -957,18 +957,20 @@ FunctionTemplateSpecializationInfo *FunctionTemplateSpecializationInfo::Create(
 // ClassTemplateSpecializationDecl Implementation
 //===----------------------------------------------------------------------===//
 
-ClassTemplateSpecializationDecl::
-ClassTemplateSpecializationDecl(ASTContext &Context, Kind DK, TagKind TK,
-                                DeclContext *DC, SourceLocation StartLoc,
-                                SourceLocation IdLoc,
-                                ClassTemplateDecl *SpecializedTemplate,
-                                ArrayRef<TemplateArgument> Args,
-                                ClassTemplateSpecializationDecl *PrevDecl)
+ClassTemplateSpecializationDecl::ClassTemplateSpecializationDecl(
+    ASTContext &Context, Kind DK, TagKind TK, DeclContext *DC,
+    SourceLocation StartLoc, SourceLocation IdLoc,
+    ClassTemplateDecl *SpecializedTemplate, ArrayRef<TemplateArgument> Args,
+    bool MatchedPackOnParmToNonPackOnArg,
+    ClassTemplateSpecializationDecl *PrevDecl)
     : CXXRecordDecl(DK, TK, Context, DC, StartLoc, IdLoc,
                     SpecializedTemplate->getIdentifier(), PrevDecl),
-    SpecializedTemplate(SpecializedTemplate),
-    TemplateArgs(TemplateArgumentList::CreateCopy(Context, Args)),
-    SpecializationKind(TSK_Undeclared) {
+      SpecializedTemplate(SpecializedTemplate),
+      TemplateArgs(TemplateArgumentList::CreateCopy(Context, Args)),
+      SpecializationKind(TSK_Undeclared),
+      MatchedPackOnParmToNonPackOnArg(MatchedPackOnParmToNonPackOnArg) {
+  assert(DK == Kind::ClassTemplateSpecialization ||
+         MatchedPackOnParmToNonPackOnArg == false);
 }
 
 ClassTemplateSpecializationDecl::ClassTemplateSpecializationDecl(ASTContext &C,
@@ -977,18 +979,14 @@ ClassTemplateSpecializationDecl::ClassTemplateSpecializationDecl(ASTContext &C,
                     SourceLocation(), nullptr, nullptr),
       SpecializationKind(TSK_Undeclared) {}
 
-ClassTemplateSpecializationDecl *
-ClassTemplateSpecializationDecl::Create(ASTContext &Context, TagKind TK,
-                                        DeclContext *DC,
-                                        SourceLocation StartLoc,
-                                        SourceLocation IdLoc,
-                                        ClassTemplateDecl *SpecializedTemplate,
-                                        ArrayRef<TemplateArgument> Args,
-                                   ClassTemplateSpecializationDecl *PrevDecl) {
-  auto *Result =
-      new (Context, DC) ClassTemplateSpecializationDecl(
-          Context, ClassTemplateSpecialization, TK, DC, StartLoc, IdLoc,
-          SpecializedTemplate, Args, PrevDecl);
+ClassTemplateSpecializationDecl *ClassTemplateSpecializationDecl::Create(
+    ASTContext &Context, TagKind TK, DeclContext *DC, SourceLocation StartLoc,
+    SourceLocation IdLoc, ClassTemplateDecl *SpecializedTemplate,
+    ArrayRef<TemplateArgument> Args, bool MatchedPackOnParmToNonPackOnArg,
+    ClassTemplateSpecializationDecl *PrevDecl) {
+  auto *Result = new (Context, DC) ClassTemplateSpecializationDecl(
+      Context, ClassTemplateSpecialization, TK, DC, StartLoc, IdLoc,
+      SpecializedTemplate, Args, MatchedPackOnParmToNonPackOnArg, PrevDecl);
   Result->setMayHaveOutOfDateDef(false);
 
   // If the template decl is incomplete, copy the external lexical storage from
@@ -1175,7 +1173,10 @@ ClassTemplatePartialSpecializationDecl::ClassTemplatePartialSpecializationDecl(
     ClassTemplatePartialSpecializationDecl *PrevDecl)
     : ClassTemplateSpecializationDecl(
           Context, ClassTemplatePartialSpecialization, TK, DC, StartLoc, IdLoc,
-          SpecializedTemplate, Args, PrevDecl),
+          // Tracking MatchedPackOnParmToNonPackOnArg for Partial
+          // Specializations is not needed.
+          SpecializedTemplate, Args, /*MatchedPackOnParmToNonPackOnArg=*/false,
+          PrevDecl),
       TemplateParams(Params), InstantiatedFromMember(nullptr, false) {
   if (AdoptTemplateParameterList(Params, this))
     setInvalidDecl();
diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
index 36ef1fc8c79db..4ab0e7cb5815b 100644
--- a/clang/lib/AST/JSONNodeDumper.cpp
+++ b/clang/lib/AST/JSONNodeDumper.cpp
@@ -1003,6 +1003,11 @@ void JSONNodeDumper::VisitRecordDecl(const RecordDecl *RD) {
 void JSONNodeDumper::VisitCXXRecordDecl(const CXXRecordDecl *RD) {
   VisitRecordDecl(RD);
 
+  if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(RD)) {
+    if (CTSD->hasMatchedPackOnParmToNonPackOnArg())
+      JOS.attribute("strict-pack-match", true);
+  }
+
   // All other information requires a complete definition.
   if (!RD->isCompleteDefinition())
     return;
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index a57cba9597482..7f5a825b68798 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -2525,8 +2525,11 @@ void TextNodeDumper::VisitCXXRecordDecl(const CXXRecordDecl *D) {
     OS << " instantiated_from";
     dumpPointer(Instance);
   }
-  if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(D))
+  if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
     dumpTemplateSpecializationKind(CTSD->getSpecializationKind());
+    if (CTSD->hasMatchedPackOnParmToNonPackOnArg())
+      OS << " strict-pack-match";
+  }
 
   dumpNestedNameSpecifier(D->getQualifier());
 
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index f4045debf4521..938671055333c 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -3651,7 +3651,7 @@ QualType Sema::CheckTemplateIdType(TemplateName Name,
           ClassTemplate->getDeclContext(),
           ClassTemplate->getTemplatedDecl()->getBeginLoc(),
           ClassTemplate->getLocation(), ClassTemplate, CTAI.CanonicalConverted,
-          nullptr);
+          CTAI.MatchedPackOnParmToNonPackOnArg, nullptr);
       ClassTemplate->AddSpecialization(Decl, InsertPos);
       if (ClassTemplate->isOutOfLine())
         Decl->setLexicalDeclContext(ClassTemplate->getLexicalDeclContext());
@@ -8566,7 +8566,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
     // this explicit specialization or friend declaration.
     Specialization = ClassTemplateSpecializationDecl::Create(
         Context, Kind, DC, KWLoc, TemplateNameLoc, ClassTemplate,
-        CTAI.CanonicalConverted, PrevDecl);
+        CTAI.CanonicalConverted, CTAI.MatchedPackOnParmToNonPackOnArg,
+        PrevDecl);
     Specialization->setTemplateArgsAsWritten(TemplateArgs);
     SetNestedNameSpecifier(*this, Specialization, SS);
     if (TemplateParameterLists.size() > 0) {
@@ -9909,7 +9910,8 @@ DeclResult Sema::ActOnExplicitInstantiation(
     // this explicit specialization.
     Specialization = ClassTemplateSpecializationDecl::Create(
         Context, Kind, ClassTemplate->getDeclContext(), KWLoc, TemplateNameLoc,
-        ClassTemplate, CTAI.CanonicalConverted, PrevDecl);
+        ClassTemplate, CTAI.CanonicalConverted,
+        CTAI.MatchedPackOnParmToNonPackOnArg, PrevDecl);
     SetNestedNameSpecifier(*this, Specialization, SS);
 
     // A MSInheritanceAttr attached to the previous declaration must be
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 137942f0c30bf..425c41f0f6236 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3341,8 +3341,6 @@ FinishTemplateArgumentDeduction(
     return ConstraintsNotSatisfied
                ? TemplateDeductionResult::ConstraintsNotSatisfied
                : TemplateDeductionResult::SubstitutionFailure;
-  if (InstCTAI.MatchedPackOnParmToNonPackOnArg)
-    Info.setMatchedPackOnParmToNonPackOnArg();
 
   TemplateParameterList *TemplateParams = Template->getTemplateParameters();
   for (unsigned I = 0, E = TemplateParams->size(); I != E; ++I) {
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 131f5c8ad1a09..b4f4469ed4e48 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -4007,7 +4007,7 @@ TemplateDeclInstantiator::VisitClassTemplateSpecializationDecl(
       ClassTemplateSpecializationDecl::Create(
           SemaRef.Context, D->getTagKind(), Owner, D->getBeginLoc(),
           D->getLocation(), InstClassTemplate, CTAI.CanonicalConverted,
-          PrevDecl);
+          CTAI.MatchedPackOnParmToNonPackOnArg, PrevDecl);
   InstD->setTemplateArgsAsWritten(InstTemplateArgs);
 
   // Add this partial specialization to the set of class template partial
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 1fa5239a597c8..77b52b832d771 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9399,7 +9399,8 @@ bool Sema::RequireCompleteTypeImpl(SourceLocation Loc, QualType T,
         runWithSufficientStackSpace(Loc, [&] {
           Diagnosed = InstantiateClassTemplateSpecialization(
               Loc, ClassTemplateSpec, TSK_ImplicitInstantiation,
-              /*Complain=*/Diagnoser);
+              /*Complain=*/Diagnoser,
+              ClassTemplateSpec->hasMatchedPackOnParmToNonPackOnArg());
         });
         Instantiated = true;
       }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 8fbb0a8d3edd8..8921b92178ee2 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2532,6 +2532,7 @@ RedeclarableResult ASTDeclReader::VisitClassTemplateSpecializationDeclImpl(
   D->TemplateArgs = TemplateArgumentList::CreateCopy(C, TemplArgs);
   D->PointOfInstantiation = readSourceLocation();
   D->SpecializationKind = (TemplateSpecializationKind)Record.readInt();
+  D->MatchedPackOnParmToNonPackOnArg = Record.readBool();
 
   bool writtenAsCanonicalDecl = Record.readInt();
   if (writtenAsCanonicalDecl) {
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index fa2294da95de8..3505db441e829 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1843,6 +1843,7 @@ void ASTDeclWriter::VisitClassTemplateSpecializationDecl(
   Record.AddTemplateArgumentList(&D->getTemplateArgs());
   Record.AddSourceLocation(D->getPointOfInstantiation());
   Record.push_back(D->getSpecializationKind());
+  Record.push_back(D->hasMatchedPackOnParmToNonPackOnArg());
   Record.push_back(D->isCanonicalDecl());
 
   if (D->isCanonicalDecl()) {
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index 9fcafbcbcc46b..86af8c50f3174 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -1,7 +1,15 @@
-// RUN: %clang_cc1 -std=c++1z -ast-print %s > %t
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -ast-dump=json %s | FileCheck --check-prefix=JSON %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -ast-print %s > %t
 // RUN: FileCheck < %t %s -check-prefix=CHECK1
 // RUN: FileCheck < %t %s -check-prefix=CHECK2
-// RUN: %clang_cc1 -std=c++1z -ast-dump %s | FileCheck --check-prefix=DUMP %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -ast-dump %s | FileCheck --check-prefix=DUMP %s
+
+// Test with serialization:
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -x c++ -std=c++17 -include-pch %t \
+// RUN: -ast-dump-all /dev/null \
+// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
+// RUN: | FileCheck --strict-whitespace --check-prefix=DUMP %s
 
 template <int X, typename Y, int Z = 5>
 struct foo {
@@ -118,3 +126,6038 @@ void func() {
 // DUMP-NEXT:     `-TemplateTypeParm {{.*}} 'Key'
 }
 }
+
+namespace test7 {
+  template <template<class> class TT> struct A {};
+  template <class...> class B {};
+  template struct A<B>;
+// DUMP-LABEL: NamespaceDecl {{.*}} test7{{$}}
+// DUMP:       ClassTemplateSpecializationDecl {{.*}} struct A definition explicit_instantiation_definition strict-pack-match{{$}}
+} // namespce test7
+
+// NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py
+
+
+// JSON-NOT: {{^}}Dumping
+// JSON:  "kind": "TranslationUnitDecl",
+// JSON-NEXT:  "loc": {},
+// JSON-NEXT:  "range": {
+// JSON-NEXT:   "begin": {},
+// JSON-NEXT:   "end": {}
+// JSON-NEXT:  },
+// JSON-NEXT:  "inner": [
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__int128_t",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "__int128"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "BuiltinType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "__int128"
+// JSON-NEXT:      }
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__uint128_t",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "unsigned __int128"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "BuiltinType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "unsigned __int128"
+// JSON-NEXT:      }
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__NSConstantString",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "__NSConstantString_tag"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "RecordType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "__NSConstantString_tag"
+// JSON-NEXT:      },
+// JSON-NEXT:      "decl": {
+// JSON-NEXT:       "id": "0x{{.*}}",
+// JSON-NEXT:       "kind": "CXXRecordDecl",
+// JSON-NEXT:       "name": "__NSConstantString_tag"
+// JSON-NEXT:      }
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__builtin_ms_va_list",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "char *"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "PointerType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "char *"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "BuiltinType",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "char"
+// JSON-NEXT:        }
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__builtin_va_list",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "__va_list_tag[1]"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ConstantArrayType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "__va_list_tag[1]"
+// JSON-NEXT:      },
+// JSON-NEXT:      "size": 1,
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "RecordType",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "__va_list_tag"
+// JSON-NEXT:        },
+// JSON-NEXT:        "decl": {
+// JSON-NEXT:         "id": "0x{{.*}}",
+// JSON-NEXT:         "kind": "CXXRecordDecl",
+// JSON-NEXT:         "name": "__va_list_tag"
+// JSON-NEXT:        }
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "ClassTemplateDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 812,
+// JSON-NEXT:     "file": "{{.*}}",
+// JSON-NEXT:     "line": 15,
+// JSON-NEXT:     "col": 8,
+// JSON-NEXT:     "tokLen": 3
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 765,
+// JSON-NEXT:      "line": 14,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 8
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 879,
+// JSON-NEXT:      "line": 19,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "foo",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 779,
+// JSON-NEXT:       "line": 14,
+// JSON-NEXT:       "col": 15,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 775,
+// JSON-NEXT:        "col": 11,
+// JSON-NEXT:        "tokLen": 3
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 779,
+// JSON-NEXT:        "col": 15,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "X",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "int"
+// JSON-NEXT:      },
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 0
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 791,
+// JSON-NEXT:       "col": 27,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 782,
+// JSON-NEXT:        "col": 18,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 791,
+// JSON-NEXT:        "col": 27,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "Y",
+// JSON-NEXT:      "tagUsed": "typename",
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 1
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 798,
+// JSON-NEXT:       "col": 34,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 794,
+// JSON-NEXT:        "col": 30,
+// JSON-NEXT:        "tokLen": 3
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 802,
+// JSON-NEXT:        "col": 38,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "Z",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "int"
+// JSON-NEXT:      },
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 2,
+// JSON-NEXT:      "defaultArg": {
+// JSON-NEXT:       "kind": "TemplateArgument",
+// JSON-NEXT:       "isExpr": true
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 802,
+// JSON-NEXT:          "col": 38,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 802,
+// JSON-NEXT:          "col": 38,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isExpr": true,
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "IntegerLiteral",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 802,
+// JSON-NEXT:            "col": 38,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 802,
+// JSON-NEXT:            "col": 38,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "int"
+// JSON-NEXT:          },
+// JSON-NEXT:          "valueCategory": "prvalue",
+// JSON-NEXT:          "value": "5"
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "CXXRecordDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 812,
+// JSON-NEXT:       "line": 15,
+// JSON-NEXT:       "col": 8,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 805,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 6
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 879,
+// JSON-NEXT:        "line": 19,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "foo",
+// JSON-NEXT:      "tagUsed": "struct",
+// JSON-NEXT:      "completeDefinition": true,
+// JSON-NEXT:      "definitionData": {
+// JSON-NEXT:       "canConstDefaultInit": true,
+// JSON-NEXT:       "copyAssign": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "copyCtor": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "defaultCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "nonTrivial": true,
+// JSON-NEXT:        "userProvided": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "dtor": {
+// JSON-NEXT:        "irrelevant": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "hasUserDeclaredConstructor": true,
+// JSON-NEXT:       "isStandardLayout": true,
+// JSON-NEXT:       "isTriviallyCopyable": true,
+// JSON-NEXT:       "moveAssign": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "moveCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "line": 15,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 805,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "isReferenced": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "tagUsed": "struct"
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FieldDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 824,
+// JSON-NEXT:         "line": 16,
+// JSON-NEXT:         "col": 7,
+// JSON-NEXT:         "tokLen": 8
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 820,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 824,
+// JSON-NEXT:          "col": 7,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "constant",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int"
+// JSON-NEXT:        }
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 836,
+// JSON-NEXT:         "line": 17,
+// JSON-NEXT:         "col": 3,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 836,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 843,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "foo<X, Y, Z>",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 842,
+// JSON-NEXT:            "col": 9,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 843,
+// JSON-NEXT:            "col": 10,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXMethodDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 849,
+// JSON-NEXT:         "line": 18,
+// JSON-NEXT:         "col": 5,
+// JSON-NEXT:         "tokLen": 6
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 847,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 877,
+// JSON-NEXT:          "col": 33,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "getSum",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "Y ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 858,
+// JSON-NEXT:            "col": 14,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 877,
+// JSON-NEXT:            "col": 33,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "ReturnStmt",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 860,
+// JSON-NEXT:              "col": 16,
+// JSON-NEXT:              "tokLen": 6
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 874,
+// JSON-NEXT:              "col": 30,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "CXXUnresolvedConstructExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 867,
+// JSON-NEXT:                "col": 23,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 874,
+// JSON-NEXT:                "col": 30,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "Y"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "BinaryOperator",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 869,
+// JSON-NEXT:                  "col": 25,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 873,
+// JSON-NEXT:                  "col": 29,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "int"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "prvalue",
+// JSON-NEXT:                "opcode": "+",
+// JSON-NEXT:                "inner": [
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "DeclRefExpr",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 869,
+// JSON-NEXT:                    "col": 25,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 869,
+// JSON-NEXT:                    "col": 25,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "int"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "prvalue",
+// JSON-NEXT:                  "referencedDecl": {
+// JSON-NEXT:                   "id": "0x{{.*}}",
+// JSON-NEXT:                   "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                   "name": "X",
+// JSON-NEXT:                   "type": {
+// JSON-NEXT:                    "qualType": "int"
+// JSON-NEXT:                   }
+// JSON-NEXT:                  }
+// JSON-NEXT:                 },
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "DeclRefExpr",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 873,
+// JSON-NEXT:                    "col": 29,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 873,
+// JSON-NEXT:                    "col": 29,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "int"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "prvalue",
+// JSON-NEXT:                  "referencedDecl": {
+// JSON-NEXT:                   "id": "0x{{.*}}",
+// JSON-NEXT:                   "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                   "name": "Z",
+// JSON-NEXT:                   "type": {
+// JSON-NEXT:                    "qualType": "int"
+// JSON-NEXT:                   }
+// JSON-NEXT:                  }
+// JSON-NEXT:                 }
+// JSON-NEXT:                ]
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 812,
+// JSON-NEXT:       "line": 15,
+// JSON-NEXT:       "col": 8,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 765,
+// JSON-NEXT:        "line": 14,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 879,
+// JSON-NEXT:        "line": 19,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "foo",
+// JSON-NEXT:      "tagUsed": "struct",
+// JSON-NEXT:      "completeDefinition": true,
+// JSON-NEXT:      "definitionData": {
+// JSON-NEXT:       "canConstDefaultInit": true,
+// JSON-NEXT:       "canPassInRegisters": true,
+// JSON-NEXT:       "copyAssign": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "copyCtor": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "defaultCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "nonTrivial": true,
+// JSON-NEXT:        "userProvided": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "dtor": {
+// JSON-NEXT:        "irrelevant": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "hasUserDeclaredConstructor": true,
+// JSON-NEXT:       "isStandardLayout": true,
+// JSON-NEXT:       "isTriviallyCopyable": true,
+// JSON-NEXT:       "moveAssign": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "moveCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 5
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "BuiltinType",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "int"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 5
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "line": 15,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 805,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "tagUsed": "struct"
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FieldDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 824,
+// JSON-NEXT:         "line": 16,
+// JSON-NEXT:         "col": 7,
+// JSON-NEXT:         "tokLen": 8
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 820,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 824,
+// JSON-NEXT:          "col": 7,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "constant",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int"
+// JSON-NEXT:        }
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 836,
+// JSON-NEXT:         "line": 17,
+// JSON-NEXT:         "col": 3,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 836,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 843,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isUsed": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi5EiLi5EEC1Ev",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 842,
+// JSON-NEXT:            "col": 9,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 843,
+// JSON-NEXT:            "col": 10,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXMethodDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 849,
+// JSON-NEXT:         "line": 18,
+// JSON-NEXT:         "col": 5,
+// JSON-NEXT:         "tokLen": 6
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 847,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 877,
+// JSON-NEXT:          "col": 33,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isUsed": true,
+// JSON-NEXT:        "name": "getSum",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi5EiLi5EE6getSumEv",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 858,
+// JSON-NEXT:            "col": 14,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 877,
+// JSON-NEXT:            "col": 33,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "ReturnStmt",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 860,
+// JSON-NEXT:              "col": 16,
+// JSON-NEXT:              "tokLen": 6
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 874,
+// JSON-NEXT:              "col": 30,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "CXXFunctionalCastExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 867,
+// JSON-NEXT:                "col": 23,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 874,
+// JSON-NEXT:                "col": 30,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "int"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "castKind": "NoOp",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "BinaryOperator",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 869,
+// JSON-NEXT:                  "col": 25,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 873,
+// JSON-NEXT:                  "col": 29,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "int"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "prvalue",
+// JSON-NEXT:                "opcode": "+",
+// JSON-NEXT:                "inner": [
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "SubstNonTypeTemplateParmExpr",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 869,
+// JSON-NEXT:                    "col": 25,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 869,
+// JSON-NEXT:                    "col": 25,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "int"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "prvalue",
+// JSON-NEXT:                  "inner": [
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                    "loc": {
+// JSON-NEXT:                     "offset": 779,
+// JSON-NEXT:                     "line": 14,
+// JSON-NEXT:                     "col": 15,
+// JSON-NEXT:                     "tokLen": 1
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 775,
+// JSON-NEXT:                      "col": 11,
+// JSON-NEXT:                      "tokLen": 3
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 779,
+// JSON-NEXT:                      "col": 15,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "isReferenced": true,
+// JSON-NEXT:                    "name": "X",
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "int"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "depth": 0,
+// JSON-NEXT:                    "index": 0
+// JSON-NEXT:                   },
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "IntegerLiteral",
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 869,
+// JSON-NEXT:                      "line": 18,
+// JSON-NEXT:                      "col": 25,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 869,
+// JSON-NEXT:                      "col": 25,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "int"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "valueCategory": "prvalue",
+// JSON-NEXT:                    "value": "5"
+// JSON-NEXT:                   }
+// JSON-NEXT:                  ]
+// JSON-NEXT:                 },
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "SubstNonTypeTemplateParmExpr",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 873,
+// JSON-NEXT:                    "col": 29,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 873,
+// JSON-NEXT:                    "col": 29,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "int"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "prvalue",
+// JSON-NEXT:                  "inner": [
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                    "loc": {
+// JSON-NEXT:                     "offset": 798,
+// JSON-NEXT:                     "line": 14,
+// JSON-NEXT:                     "col": 34,
+// JSON-NEXT:                     "tokLen": 1
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 794,
+// JSON-NEXT:                      "col": 30,
+// JSON-NEXT:                      "tokLen": 3
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 802,
+// JSON-NEXT:                      "col": 38,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "isReferenced": true,
+// JSON-NEXT:                    "name": "Z",
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "int"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "depth": 0,
+// JSON-NEXT:                    "index": 2,
+// JSON-NEXT:                    "defaultArg": {
+// JSON-NEXT:                     "kind": "TemplateArgument",
+// JSON-NEXT:                     "isExpr": true
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "inner": [
+// JSON-NEXT:                     {
+// JSON-NEXT:                      "kind": "TemplateArgument",
+// JSON-NEXT:                      "range": {
+// JSON-NEXT:                       "begin": {
+// JSON-NEXT:                        "offset": 802,
+// JSON-NEXT:                        "col": 38,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       },
+// JSON-NEXT:                       "end": {
+// JSON-NEXT:                        "offset": 802,
+// JSON-NEXT:                        "col": 38,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       }
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "isExpr": true,
+// JSON-NEXT:                      "inner": [
+// JSON-NEXT:                       {
+// JSON-NEXT:                        "id": "0x{{.*}}",
+// JSON-NEXT:                        "kind": "IntegerLiteral",
+// JSON-NEXT:                        "range": {
+// JSON-NEXT:                         "begin": {
+// JSON-NEXT:                          "offset": 802,
+// JSON-NEXT:                          "col": 38,
+// JSON-NEXT:                          "tokLen": 1
+// JSON-NEXT:                         },
+// JSON-NEXT:                         "end": {
+// JSON-NEXT:                          "offset": 802,
+// JSON-NEXT:                          "col": 38,
+// JSON-NEXT:                          "tokLen": 1
+// JSON-NEXT:                         }
+// JSON-NEXT:                        },
+// JSON-NEXT:                        "type": {
+// JSON-NEXT:                         "qualType": "int"
+// JSON-NEXT:                        },
+// JSON-NEXT:                        "valueCategory": "prvalue",
+// JSON-NEXT:                        "value": "5"
+// JSON-NEXT:                       }
+// JSON-NEXT:                      ]
+// JSON-NEXT:                     }
+// JSON-NEXT:                    ]
+// JSON-NEXT:                   },
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "IntegerLiteral",
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 873,
+// JSON-NEXT:                      "line": 18,
+// JSON-NEXT:                      "col": 29,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 873,
+// JSON-NEXT:                      "col": 29,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "int"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "valueCategory": "prvalue",
+// JSON-NEXT:                    "value": "5"
+// JSON-NEXT:                   }
+// JSON-NEXT:                  ]
+// JSON-NEXT:                 }
+// JSON-NEXT:                ]
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "line": 15,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi5EiLi5EEC1ERKS0_",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void (const foo<5, int> &)"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inline": true,
+// JSON-NEXT:        "constexpr": true,
+// JSON-NEXT:        "explicitlyDefaulted": "default",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ParmVarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 812,
+// JSON-NEXT:           "col": 8,
+// JSON-NEXT:           "tokLen": 3
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "const foo<5, int> &"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi5EiLi5EEC1EOS0_",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void (foo<5, int> &&)"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inline": true,
+// JSON-NEXT:        "constexpr": true,
+// JSON-NEXT:        "explicitlyDefaulted": "default",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ParmVarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 812,
+// JSON-NEXT:           "col": 8,
+// JSON-NEXT:           "tokLen": 3
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "foo<5, int> &&"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXDestructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "isReferenced": true,
+// JSON-NEXT:        "name": "~foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi5EiLi5EED1Ev",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void () noexcept"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inline": true,
+// JSON-NEXT:        "explicitlyDefaulted": "default"
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 812,
+// JSON-NEXT:       "col": 8,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 765,
+// JSON-NEXT:        "line": 14,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 879,
+// JSON-NEXT:        "line": 19,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "foo",
+// JSON-NEXT:      "tagUsed": "struct",
+// JSON-NEXT:      "completeDefinition": true,
+// JSON-NEXT:      "definitionData": {
+// JSON-NEXT:       "canConstDefaultInit": true,
+// JSON-NEXT:       "canPassInRegisters": true,
+// JSON-NEXT:       "copyAssign": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "copyCtor": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "defaultCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "nonTrivial": true,
+// JSON-NEXT:        "userProvided": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "dtor": {
+// JSON-NEXT:        "irrelevant": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "hasUserDeclaredConstructor": true,
+// JSON-NEXT:       "isStandardLayout": true,
+// JSON-NEXT:       "isTriviallyCopyable": true,
+// JSON-NEXT:       "moveAssign": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "moveCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 2
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "double"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "BuiltinType",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "double"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 3
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "line": 15,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 805,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "tagUsed": "struct"
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FieldDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 824,
+// JSON-NEXT:         "line": 16,
+// JSON-NEXT:         "col": 7,
+// JSON-NEXT:         "tokLen": 8
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 820,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 824,
+// JSON-NEXT:          "col": 7,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "constant",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int"
+// JSON-NEXT:        }
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 836,
+// JSON-NEXT:         "line": 17,
+// JSON-NEXT:         "col": 3,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 836,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 843,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isUsed": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi2EdLi3EEC1Ev",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 842,
+// JSON-NEXT:            "col": 9,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 843,
+// JSON-NEXT:            "col": 10,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXMethodDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 849,
+// JSON-NEXT:         "line": 18,
+// JSON-NEXT:         "col": 5,
+// JSON-NEXT:         "tokLen": 6
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 847,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 877,
+// JSON-NEXT:          "col": 33,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isUsed": true,
+// JSON-NEXT:        "name": "getSum",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi2EdLi3EE6getSumEv",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "double ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 858,
+// JSON-NEXT:            "col": 14,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 877,
+// JSON-NEXT:            "col": 33,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "ReturnStmt",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 860,
+// JSON-NEXT:              "col": 16,
+// JSON-NEXT:              "tokLen": 6
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 874,
+// JSON-NEXT:              "col": 30,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "CXXFunctionalCastExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 867,
+// JSON-NEXT:                "col": 23,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 874,
+// JSON-NEXT:                "col": 30,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "double"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "castKind": "NoOp",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "ImplicitCastExpr",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 869,
+// JSON-NEXT:                  "col": 25,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 873,
+// JSON-NEXT:                  "col": 29,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "double"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "prvalue",
+// JSON-NEXT:                "castKind": "IntegralToFloating",
+// JSON-NEXT:                "isPartOfExplicitCast": true,
+// JSON-NEXT:                "inner": [
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "BinaryOperator",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 869,
+// JSON-NEXT:                    "col": 25,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 873,
+// JSON-NEXT:                    "col": 29,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "int"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "prvalue",
+// JSON-NEXT:                  "opcode": "+",
+// JSON-NEXT:                  "inner": [
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "SubstNonTypeTemplateParmExpr",
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 869,
+// JSON-NEXT:                      "col": 25,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 869,
+// JSON-NEXT:                      "col": 25,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "int"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "valueCategory": "prvalue",
+// JSON-NEXT:                    "inner": [
+// JSON-NEXT:                     {
+// JSON-NEXT:                      "id": "0x{{.*}}",
+// JSON-NEXT:                      "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                      "loc": {
+// JSON-NEXT:                       "offset": 779,
+// JSON-NEXT:                       "line": 14,
+// JSON-NEXT:                       "col": 15,
+// JSON-NEXT:                       "tokLen": 1
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "range": {
+// JSON-NEXT:                       "begin": {
+// JSON-NEXT:                        "offset": 775,
+// JSON-NEXT:                        "col": 11,
+// JSON-NEXT:                        "tokLen": 3
+// JSON-NEXT:                       },
+// JSON-NEXT:                       "end": {
+// JSON-NEXT:                        "offset": 779,
+// JSON-NEXT:                        "col": 15,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       }
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "isReferenced": true,
+// JSON-NEXT:                      "name": "X",
+// JSON-NEXT:                      "type": {
+// JSON-NEXT:                       "qualType": "int"
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "depth": 0,
+// JSON-NEXT:                      "index": 0
+// JSON-NEXT:                     },
+// JSON-NEXT:                     {
+// JSON-NEXT:                      "id": "0x{{.*}}",
+// JSON-NEXT:                      "kind": "IntegerLiteral",
+// JSON-NEXT:                      "range": {
+// JSON-NEXT:                       "begin": {
+// JSON-NEXT:                        "offset": 869,
+// JSON-NEXT:                        "line": 18,
+// JSON-NEXT:                        "col": 25,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       },
+// JSON-NEXT:                       "end": {
+// JSON-NEXT:                        "offset": 869,
+// JSON-NEXT:                        "col": 25,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       }
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "type": {
+// JSON-NEXT:                       "qualType": "int"
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "valueCategory": "prvalue",
+// JSON-NEXT:                      "value": "2"
+// JSON-NEXT:                     }
+// JSON-NEXT:                    ]
+// JSON-NEXT:                   },
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "SubstNonTypeTemplateParmExpr",
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 873,
+// JSON-NEXT:                      "col": 29,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 873,
+// JSON-NEXT:                      "col": 29,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "int"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "valueCategory": "prvalue",
+// JSON-NEXT:                    "inner": [
+// JSON-NEXT:                     {
+// JSON-NEXT:                      "id": "0x{{.*}}",
+// JSON-NEXT:                      "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                      "loc": {
+// JSON-NEXT:                       "offset": 798,
+// JSON-NEXT:                       "line": 14,
+// JSON-NEXT:                       "col": 34,
+// JSON-NEXT:                       "tokLen": 1
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "range": {
+// JSON-NEXT:                       "begin": {
+// JSON-NEXT:                        "offset": 794,
+// JSON-NEXT:                        "col": 30,
+// JSON-NEXT:                        "tokLen": 3
+// JSON-NEXT:                       },
+// JSON-NEXT:                       "end": {
+// JSON-NEXT:                        "offset": 802,
+// JSON-NEXT:                        "col": 38,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       }
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "isReferenced": true,
+// JSON-NEXT:                      "name": "Z",
+// JSON-NEXT:                      "type": {
+// JSON-NEXT:                       "qualType": "int"
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "depth": 0,
+// JSON-NEXT:                      "index": 2,
+// JSON-NEXT:                      "defaultArg": {
+// JSON-NEXT:                       "kind": "TemplateArgument",
+// JSON-NEXT:                       "isExpr": true
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "inner": [
+// JSON-NEXT:                       {
+// JSON-NEXT:                        "kind": "TemplateArgument",
+// JSON-NEXT:                        "range": {
+// JSON-NEXT:                         "begin": {
+// JSON-NEXT:                          "offset": 802,
+// JSON-NEXT:                          "col": 38,
+// JSON-NEXT:                          "tokLen": 1
+// JSON-NEXT:                         },
+// JSON-NEXT:                         "end": {
+// JSON-NEXT:                          "offset": 802,
+// JSON-NEXT:                          "col": 38,
+// JSON-NEXT:                          "tokLen": 1
+// JSON-NEXT:                         }
+// JSON-NEXT:                        },
+// JSON-NEXT:                        "isExpr": true,
+// JSON-NEXT:                        "inner": [
+// JSON-NEXT:                         {
+// JSON-NEXT:                          "id": "0x{{.*}}",
+// JSON-NEXT:                          "kind": "IntegerLiteral",
+// JSON-NEXT:                          "range": {
+// JSON-NEXT:                           "begin": {
+// JSON-NEXT:                            "offset": 802,
+// JSON-NEXT:                            "col": 38,
+// JSON-NEXT:                            "tokLen": 1
+// JSON-NEXT:                           },
+// JSON-NEXT:                           "end": {
+// JSON-NEXT:                            "offset": 802,
+// JSON-NEXT:                            "col": 38,
+// JSON-NEXT:                            "tokLen": 1
+// JSON-NEXT:                           }
+// JSON-NEXT:                          },
+// JSON-NEXT:                          "type": {
+// JSON-NEXT:                           "qualType": "int"
+// JSON-NEXT:                          },
+// JSON-NEXT:                          "valueCategory": "prvalue",
+// JSON-NEXT:                          "value": "5"
+// JSON-NEXT:                         }
+// JSON-NEXT:                        ]
+// JSON-NEXT:                       }
+// JSON-NEXT:                      ]
+// JSON-NEXT:                     },
+// JSON-NEXT:                     {
+// JSON-NEXT:                      "id": "0x{{.*}}",
+// JSON-NEXT:                      "kind": "IntegerLiteral",
+// JSON-NEXT:                      "range": {
+// JSON-NEXT:                       "begin": {
+// JSON-NEXT:                        "offset": 873,
+// JSON-NEXT:                        "line": 18,
+// JSON-NEXT:                        "col": 29,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       },
+// JSON-NEXT:                       "end": {
+// JSON-NEXT:                        "offset": 873,
+// JSON-NEXT:                        "col": 29,
+// JSON-NEXT:                        "tokLen": 1
+// JSON-NEXT:                       }
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "type": {
+// JSON-NEXT:                       "qualType": "int"
+// JSON-NEXT:                      },
+// JSON-NEXT:                      "valueCategory": "prvalue",
+// JSON-NEXT:                      "value": "3"
+// JSON-NEXT:                     }
+// JSON-NEXT:                    ]
+// JSON-NEXT:                   }
+// JSON-NEXT:                  ]
+// JSON-NEXT:                 }
+// JSON-NEXT:                ]
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "line": 15,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi2EdLi3EEC1ERKS0_",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void (const foo<2, double, 3> &)"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inline": true,
+// JSON-NEXT:        "constexpr": true,
+// JSON-NEXT:        "explicitlyDefaulted": "default",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ParmVarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 812,
+// JSON-NEXT:           "col": 8,
+// JSON-NEXT:           "tokLen": 3
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "const foo<2, double, 3> &"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXConstructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi2EdLi3EEC1EOS0_",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void (foo<2, double, 3> &&)"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inline": true,
+// JSON-NEXT:        "constexpr": true,
+// JSON-NEXT:        "explicitlyDefaulted": "default",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ParmVarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 812,
+// JSON-NEXT:           "col": 8,
+// JSON-NEXT:           "tokLen": 3
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 812,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "foo<2, double, 3> &&"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXDestructorDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 812,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 812,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "isReferenced": true,
+// JSON-NEXT:        "name": "~foo",
+// JSON-NEXT:        "mangledName": "_ZN3fooILi2EdLi3EED1Ev",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void () noexcept"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inline": true,
+// JSON-NEXT:        "explicitlyDefaulted": "default"
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "FunctionTemplateDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 914,
+// JSON-NEXT:     "line": 22,
+// JSON-NEXT:     "col": 3,
+// JSON-NEXT:     "tokLen": 3
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 883,
+// JSON-NEXT:      "line": 21,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 8
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 937,
+// JSON-NEXT:      "line": 24,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "bar",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 897,
+// JSON-NEXT:       "line": 21,
+// JSON-NEXT:       "col": 15,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 893,
+// JSON-NEXT:        "col": 11,
+// JSON-NEXT:        "tokLen": 3
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 897,
+// JSON-NEXT:        "col": 15,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "A",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "int"
+// JSON-NEXT:      },
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 0
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 909,
+// JSON-NEXT:       "col": 27,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 900,
+// JSON-NEXT:        "col": 18,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 909,
+// JSON-NEXT:        "col": 27,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "B",
+// JSON-NEXT:      "tagUsed": "typename",
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 1
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 914,
+// JSON-NEXT:       "line": 22,
+// JSON-NEXT:       "col": 3,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 912,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 937,
+// JSON-NEXT:        "line": 24,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "bar",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "B ()"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CompoundStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 920,
+// JSON-NEXT:          "line": 22,
+// JSON-NEXT:          "col": 9,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 937,
+// JSON-NEXT:          "line": 24,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ReturnStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 924,
+// JSON-NEXT:            "line": 23,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 934,
+// JSON-NEXT:            "col": 13,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "CXXUnresolvedConstructExpr",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 931,
+// JSON-NEXT:              "col": 10,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 934,
+// JSON-NEXT:              "col": 13,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "B"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "DeclRefExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 933,
+// JSON-NEXT:                "col": 12,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 933,
+// JSON-NEXT:                "col": 12,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "int"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "referencedDecl": {
+// JSON-NEXT:               "id": "0x{{.*}}",
+// JSON-NEXT:               "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:               "name": "A",
+// JSON-NEXT:               "type": {
+// JSON-NEXT:                "qualType": "int"
+// JSON-NEXT:               }
+// JSON-NEXT:              }
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 914,
+// JSON-NEXT:       "line": 22,
+// JSON-NEXT:       "col": 3,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 912,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 937,
+// JSON-NEXT:        "line": 24,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isUsed": true,
+// JSON-NEXT:      "name": "bar",
+// JSON-NEXT:      "mangledName": "_Z3barILi5EiET0_v",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "int ()"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 5
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "BuiltinType",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "int"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CompoundStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 920,
+// JSON-NEXT:          "line": 22,
+// JSON-NEXT:          "col": 9,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 937,
+// JSON-NEXT:          "line": 24,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ReturnStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 924,
+// JSON-NEXT:            "line": 23,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 934,
+// JSON-NEXT:            "col": 13,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "CXXFunctionalCastExpr",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 931,
+// JSON-NEXT:              "col": 10,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 934,
+// JSON-NEXT:              "col": 13,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "int"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "castKind": "NoOp",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "SubstNonTypeTemplateParmExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 933,
+// JSON-NEXT:                "col": 12,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 933,
+// JSON-NEXT:                "col": 12,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "int"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:                "loc": {
+// JSON-NEXT:                 "offset": 897,
+// JSON-NEXT:                 "line": 21,
+// JSON-NEXT:                 "col": 15,
+// JSON-NEXT:                 "tokLen": 1
+// JSON-NEXT:                },
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 893,
+// JSON-NEXT:                  "col": 11,
+// JSON-NEXT:                  "tokLen": 3
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 897,
+// JSON-NEXT:                  "col": 15,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "isReferenced": true,
+// JSON-NEXT:                "name": "A",
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "int"
+// JSON-NEXT:                },
+// JSON-NEXT:                "depth": 0,
+// JSON-NEXT:                "index": 0
+// JSON-NEXT:               },
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "IntegerLiteral",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 933,
+// JSON-NEXT:                  "line": 23,
+// JSON-NEXT:                  "col": 12,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 933,
+// JSON-NEXT:                  "col": 12,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "int"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "prvalue",
+// JSON-NEXT:                "value": "5"
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "FunctionDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 945,
+// JSON-NEXT:     "line": 26,
+// JSON-NEXT:     "col": 6,
+// JSON-NEXT:     "tokLen": 3
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 940,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 4
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 1055,
+// JSON-NEXT:      "line": 30,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "baz",
+// JSON-NEXT:    "mangledName": "_Z3bazv",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "void ()"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "CompoundStmt",
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 951,
+// JSON-NEXT:        "line": 26,
+// JSON-NEXT:        "col": 12,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 1055,
+// JSON-NEXT:        "line": 30,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "DeclStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 955,
+// JSON-NEXT:          "line": 27,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 976,
+// JSON-NEXT:          "col": 24,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "VarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 959,
+// JSON-NEXT:           "col": 7,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 955,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 975,
+// JSON-NEXT:            "col": 23,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "name": "x",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "int"
+// JSON-NEXT:          },
+// JSON-NEXT:          "init": "c",
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "CallExpr",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 963,
+// JSON-NEXT:              "col": 11,
+// JSON-NEXT:              "tokLen": 3
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 975,
+// JSON-NEXT:              "col": 23,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "int"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "ImplicitCastExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 963,
+// JSON-NEXT:                "col": 11,
+// JSON-NEXT:                "tokLen": 3
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 973,
+// JSON-NEXT:                "col": 21,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "int (*)()"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "castKind": "FunctionToPointerDecay",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "DeclRefExpr",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 963,
+// JSON-NEXT:                  "col": 11,
+// JSON-NEXT:                  "tokLen": 3
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 973,
+// JSON-NEXT:                  "col": 21,
+// JSON-NEXT:                  "tokLen": 1
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "int ()"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "lvalue",
+// JSON-NEXT:                "referencedDecl": {
+// JSON-NEXT:                 "id": "0x{{.*}}",
+// JSON-NEXT:                 "kind": "FunctionDecl",
+// JSON-NEXT:                 "name": "bar",
+// JSON-NEXT:                 "type": {
+// JSON-NEXT:                  "qualType": "int ()"
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "foundReferencedDecl": {
+// JSON-NEXT:                 "id": "0x{{.*}}",
+// JSON-NEXT:                 "kind": "FunctionTemplateDecl",
+// JSON-NEXT:                 "name": "bar"
+// JSON-NEXT:                }
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "DeclStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 980,
+// JSON-NEXT:          "line": 28,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 1010,
+// JSON-NEXT:          "col": 33,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "VarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 984,
+// JSON-NEXT:           "col": 7,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 980,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 1009,
+// JSON-NEXT:            "col": 32,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "name": "y",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "int"
+// JSON-NEXT:          },
+// JSON-NEXT:          "init": "c",
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "ExprWithCleanups",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 988,
+// JSON-NEXT:              "col": 11,
+// JSON-NEXT:              "tokLen": 3
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 1009,
+// JSON-NEXT:              "col": 32,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "int"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "CXXMemberCallExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 988,
+// JSON-NEXT:                "col": 11,
+// JSON-NEXT:                "tokLen": 3
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 1009,
+// JSON-NEXT:                "col": 32,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "int"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "MemberExpr",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 988,
+// JSON-NEXT:                  "col": 11,
+// JSON-NEXT:                  "tokLen": 3
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 1002,
+// JSON-NEXT:                  "col": 25,
+// JSON-NEXT:                  "tokLen": 6
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "<bound member function type>"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "prvalue",
+// JSON-NEXT:                "name": "getSum",
+// JSON-NEXT:                "isArrow": false,
+// JSON-NEXT:                "referencedMemberDecl": "0x{{.*}}",
+// JSON-NEXT:                "inner": [
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "MaterializeTemporaryExpr",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 988,
+// JSON-NEXT:                    "col": 11,
+// JSON-NEXT:                    "tokLen": 3
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 1000,
+// JSON-NEXT:                    "col": 23,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "foo<5, int>"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "xvalue",
+// JSON-NEXT:                  "storageDuration": "full expression",
+// JSON-NEXT:                  "inner": [
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "CXXTemporaryObjectExpr",
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 988,
+// JSON-NEXT:                      "col": 11,
+// JSON-NEXT:                      "tokLen": 3
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 1000,
+// JSON-NEXT:                      "col": 23,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "foo<5, int>"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "valueCategory": "prvalue",
+// JSON-NEXT:                    "ctorType": {
+// JSON-NEXT:                     "qualType": "void ()"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "hadMultipleCandidates": true,
+// JSON-NEXT:                    "constructionKind": "complete"
+// JSON-NEXT:                   }
+// JSON-NEXT:                  ]
+// JSON-NEXT:                 }
+// JSON-NEXT:                ]
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "DeclStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 1014,
+// JSON-NEXT:          "line": 29,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 1053,
+// JSON-NEXT:          "col": 42,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "VarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 1021,
+// JSON-NEXT:           "col": 10,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 1014,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 1052,
+// JSON-NEXT:            "col": 41,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "name": "z",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "double"
+// JSON-NEXT:          },
+// JSON-NEXT:          "init": "c",
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "ExprWithCleanups",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 1025,
+// JSON-NEXT:              "col": 14,
+// JSON-NEXT:              "tokLen": 3
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 1052,
+// JSON-NEXT:              "col": 41,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "double"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "CXXMemberCallExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 1025,
+// JSON-NEXT:                "col": 14,
+// JSON-NEXT:                "tokLen": 3
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 1052,
+// JSON-NEXT:                "col": 41,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "double"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue",
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "MemberExpr",
+// JSON-NEXT:                "range": {
+// JSON-NEXT:                 "begin": {
+// JSON-NEXT:                  "offset": 1025,
+// JSON-NEXT:                  "col": 14,
+// JSON-NEXT:                  "tokLen": 3
+// JSON-NEXT:                 },
+// JSON-NEXT:                 "end": {
+// JSON-NEXT:                  "offset": 1045,
+// JSON-NEXT:                  "col": 34,
+// JSON-NEXT:                  "tokLen": 6
+// JSON-NEXT:                 }
+// JSON-NEXT:                },
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "<bound member function type>"
+// JSON-NEXT:                },
+// JSON-NEXT:                "valueCategory": "prvalue",
+// JSON-NEXT:                "name": "getSum",
+// JSON-NEXT:                "isArrow": false,
+// JSON-NEXT:                "referencedMemberDecl": "0x{{.*}}",
+// JSON-NEXT:                "inner": [
+// JSON-NEXT:                 {
+// JSON-NEXT:                  "id": "0x{{.*}}",
+// JSON-NEXT:                  "kind": "MaterializeTemporaryExpr",
+// JSON-NEXT:                  "range": {
+// JSON-NEXT:                   "begin": {
+// JSON-NEXT:                    "offset": 1025,
+// JSON-NEXT:                    "col": 14,
+// JSON-NEXT:                    "tokLen": 3
+// JSON-NEXT:                   },
+// JSON-NEXT:                   "end": {
+// JSON-NEXT:                    "offset": 1043,
+// JSON-NEXT:                    "col": 32,
+// JSON-NEXT:                    "tokLen": 1
+// JSON-NEXT:                   }
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "type": {
+// JSON-NEXT:                   "qualType": "foo<2, double, 3>"
+// JSON-NEXT:                  },
+// JSON-NEXT:                  "valueCategory": "xvalue",
+// JSON-NEXT:                  "storageDuration": "full expression",
+// JSON-NEXT:                  "inner": [
+// JSON-NEXT:                   {
+// JSON-NEXT:                    "id": "0x{{.*}}",
+// JSON-NEXT:                    "kind": "CXXTemporaryObjectExpr",
+// JSON-NEXT:                    "range": {
+// JSON-NEXT:                     "begin": {
+// JSON-NEXT:                      "offset": 1025,
+// JSON-NEXT:                      "col": 14,
+// JSON-NEXT:                      "tokLen": 3
+// JSON-NEXT:                     },
+// JSON-NEXT:                     "end": {
+// JSON-NEXT:                      "offset": 1043,
+// JSON-NEXT:                      "col": 32,
+// JSON-NEXT:                      "tokLen": 1
+// JSON-NEXT:                     }
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "type": {
+// JSON-NEXT:                     "qualType": "foo<2, double, 3>"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "valueCategory": "prvalue",
+// JSON-NEXT:                    "ctorType": {
+// JSON-NEXT:                     "qualType": "void ()"
+// JSON-NEXT:                    },
+// JSON-NEXT:                    "hadMultipleCandidates": true,
+// JSON-NEXT:                    "constructionKind": "complete"
+// JSON-NEXT:                   }
+// JSON-NEXT:                  ]
+// JSON-NEXT:                 }
+// JSON-NEXT:                ]
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "ClassTemplateDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 1856,
+// JSON-NEXT:     "line": 52,
+// JSON-NEXT:     "col": 33,
+// JSON-NEXT:     "tokLen": 1
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 1824,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 8
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 1896,
+// JSON-NEXT:      "line": 54,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "A",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 1846,
+// JSON-NEXT:       "line": 52,
+// JSON-NEXT:       "col": 23,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 1834,
+// JSON-NEXT:        "col": 11,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 1846,
+// JSON-NEXT:        "col": 23,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "T",
+// JSON-NEXT:      "tagUsed": "typename",
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 0,
+// JSON-NEXT:      "isParameterPack": true
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "CXXRecordDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 1856,
+// JSON-NEXT:       "col": 33,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 1849,
+// JSON-NEXT:        "col": 26,
+// JSON-NEXT:        "tokLen": 6
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 1896,
+// JSON-NEXT:        "line": 54,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "A",
+// JSON-NEXT:      "tagUsed": "struct",
+// JSON-NEXT:      "completeDefinition": true,
+// JSON-NEXT:      "definitionData": {
+// JSON-NEXT:       "canConstDefaultInit": true,
+// JSON-NEXT:       "copyAssign": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "copyCtor": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "defaultCtor": {
+// JSON-NEXT:        "defaultedIsConstexpr": true,
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "isConstexpr": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "dtor": {
+// JSON-NEXT:        "irrelevant": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:       "isAggregate": true,
+// JSON-NEXT:       "isEmpty": true,
+// JSON-NEXT:       "isLiteral": true,
+// JSON-NEXT:       "isPOD": true,
+// JSON-NEXT:       "isStandardLayout": true,
+// JSON-NEXT:       "isTrivial": true,
+// JSON-NEXT:       "isTriviallyCopyable": true,
+// JSON-NEXT:       "moveAssign": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "moveCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 1856,
+// JSON-NEXT:         "line": 52,
+// JSON-NEXT:         "col": 33,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 1849,
+// JSON-NEXT:          "col": 26,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 1856,
+// JSON-NEXT:          "col": 33,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "A",
+// JSON-NEXT:        "tagUsed": "struct"
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ClassTemplateDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 1890,
+// JSON-NEXT:         "line": 53,
+// JSON-NEXT:         "col": 31,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 1862,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 1893,
+// JSON-NEXT:          "col": 34,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "B",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 1877,
+// JSON-NEXT:           "col": 18,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 1872,
+// JSON-NEXT:            "col": 13,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 1880,
+// JSON-NEXT:            "col": 21,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "name": "x",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "T[3]..."
+// JSON-NEXT:          },
+// JSON-NEXT:          "depth": 1,
+// JSON-NEXT:          "index": 0,
+// JSON-NEXT:          "isParameterPack": true
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXRecordDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 1890,
+// JSON-NEXT:           "col": 31,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 1883,
+// JSON-NEXT:            "col": 24,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 1893,
+// JSON-NEXT:            "col": 34,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "name": "B",
+// JSON-NEXT:          "tagUsed": "struct",
+// JSON-NEXT:          "completeDefinition": true,
+// JSON-NEXT:          "definitionData": {
+// JSON-NEXT:           "canConstDefaultInit": true,
+// JSON-NEXT:           "copyAssign": {
+// JSON-NEXT:            "hasConstParam": true,
+// JSON-NEXT:            "implicitHasConstParam": true,
+// JSON-NEXT:            "needsImplicit": true,
+// JSON-NEXT:            "simple": true,
+// JSON-NEXT:            "trivial": true
+// JSON-NEXT:           },
+// JSON-NEXT:           "copyCtor": {
+// JSON-NEXT:            "hasConstParam": true,
+// JSON-NEXT:            "implicitHasConstParam": true,
+// JSON-NEXT:            "needsImplicit": true,
+// JSON-NEXT:            "simple": true,
+// JSON-NEXT:            "trivial": true
+// JSON-NEXT:           },
+// JSON-NEXT:           "defaultCtor": {
+// JSON-NEXT:            "defaultedIsConstexpr": true,
+// JSON-NEXT:            "exists": true,
+// JSON-NEXT:            "isConstexpr": true,
+// JSON-NEXT:            "needsImplicit": true,
+// JSON-NEXT:            "trivial": true
+// JSON-NEXT:           },
+// JSON-NEXT:           "dtor": {
+// JSON-NEXT:            "irrelevant": true,
+// JSON-NEXT:            "needsImplicit": true,
+// JSON-NEXT:            "simple": true,
+// JSON-NEXT:            "trivial": true
+// JSON-NEXT:           },
+// JSON-NEXT:           "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:           "isAggregate": true,
+// JSON-NEXT:           "isEmpty": true,
+// JSON-NEXT:           "isLiteral": true,
+// JSON-NEXT:           "isPOD": true,
+// JSON-NEXT:           "isStandardLayout": true,
+// JSON-NEXT:           "isTrivial": true,
+// JSON-NEXT:           "isTriviallyCopyable": true,
+// JSON-NEXT:           "moveAssign": {
+// JSON-NEXT:            "exists": true,
+// JSON-NEXT:            "needsImplicit": true,
+// JSON-NEXT:            "simple": true,
+// JSON-NEXT:            "trivial": true
+// JSON-NEXT:           },
+// JSON-NEXT:           "moveCtor": {
+// JSON-NEXT:            "exists": true,
+// JSON-NEXT:            "needsImplicit": true,
+// JSON-NEXT:            "simple": true,
+// JSON-NEXT:            "trivial": true
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "CXXRecordDecl",
+// JSON-NEXT:            "loc": {
+// JSON-NEXT:             "offset": 1890,
+// JSON-NEXT:             "col": 31,
+// JSON-NEXT:             "tokLen": 1
+// JSON-NEXT:            },
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 1883,
+// JSON-NEXT:              "col": 24,
+// JSON-NEXT:              "tokLen": 6
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 1890,
+// JSON-NEXT:              "col": 31,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "isImplicit": true,
+// JSON-NEXT:            "name": "B",
+// JSON-NEXT:            "tagUsed": "struct"
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "FunctionTemplateDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 2016,
+// JSON-NEXT:     "line": 58,
+// JSON-NEXT:     "col": 31,
+// JSON-NEXT:     "tokLen": 1
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 1986,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 8
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 2038,
+// JSON-NEXT:      "line": 60,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "f",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2008,
+// JSON-NEXT:       "line": 58,
+// JSON-NEXT:       "col": 23,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 1996,
+// JSON-NEXT:        "col": 11,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2008,
+// JSON-NEXT:        "col": 23,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isReferenced": true,
+// JSON-NEXT:      "name": "T",
+// JSON-NEXT:      "tagUsed": "typename",
+// JSON-NEXT:      "depth": 0,
+// JSON-NEXT:      "index": 0,
+// JSON-NEXT:      "isParameterPack": true
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2016,
+// JSON-NEXT:       "col": 31,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2011,
+// JSON-NEXT:        "col": 26,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2038,
+// JSON-NEXT:        "line": 60,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "f",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "void ()"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CompoundStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2020,
+// JSON-NEXT:          "line": 58,
+// JSON-NEXT:          "col": 35,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2038,
+// JSON-NEXT:          "line": 60,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "DeclStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2024,
+// JSON-NEXT:            "line": 59,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2036,
+// JSON-NEXT:            "col": 15,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "VarDecl",
+// JSON-NEXT:            "loc": {
+// JSON-NEXT:             "offset": 2035,
+// JSON-NEXT:             "col": 14,
+// JSON-NEXT:             "tokLen": 1
+// JSON-NEXT:            },
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 2024,
+// JSON-NEXT:              "col": 3,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 2035,
+// JSON-NEXT:              "col": 14,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "name": "a",
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "A<T[3]...>"
+// JSON-NEXT:            }
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "NamespaceDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 2051,
+// JSON-NEXT:     "line": 62,
+// JSON-NEXT:     "col": 11,
+// JSON-NEXT:     "tokLen": 5
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 2041,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 9
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 2240,
+// JSON-NEXT:      "line": 71,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "test2",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2064,
+// JSON-NEXT:       "line": 63,
+// JSON-NEXT:       "col": 6,
+// JSON-NEXT:       "tokLen": 4
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2059,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2072,
+// JSON-NEXT:        "col": 14,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "func",
+// JSON-NEXT:      "mangledName": "_ZN5test24funcEi",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "void (int)"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ParmVarDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2072,
+// JSON-NEXT:         "col": 14,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2069,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2069,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "int"
+// JSON-NEXT:        }
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2080,
+// JSON-NEXT:       "line": 64,
+// JSON-NEXT:       "col": 6,
+// JSON-NEXT:       "tokLen": 4
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2075,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2090,
+// JSON-NEXT:        "col": 16,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "func",
+// JSON-NEXT:      "mangledName": "_ZN5test24funcEf",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "void (float)"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ParmVarDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2090,
+// JSON-NEXT:         "col": 16,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2085,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2085,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "float"
+// JSON-NEXT:        }
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2119,
+// JSON-NEXT:       "line": 66,
+// JSON-NEXT:       "col": 6,
+// JSON-NEXT:       "tokLen": 4
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2093,
+// JSON-NEXT:        "line": 65,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2141,
+// JSON-NEXT:        "line": 68,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "tmpl",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2111,
+// JSON-NEXT:         "line": 65,
+// JSON-NEXT:         "col": 19,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2102,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2111,
+// JSON-NEXT:          "col": 19,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isReferenced": true,
+// JSON-NEXT:        "name": "T",
+// JSON-NEXT:        "tagUsed": "typename",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FunctionDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2119,
+// JSON-NEXT:         "line": 66,
+// JSON-NEXT:         "col": 6,
+// JSON-NEXT:         "tokLen": 4
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2114,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2141,
+// JSON-NEXT:          "line": 68,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "tmpl",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2126,
+// JSON-NEXT:            "line": 66,
+// JSON-NEXT:            "col": 13,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2141,
+// JSON-NEXT:            "line": 68,
+// JSON-NEXT:            "col": 1,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "CallExpr",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 2130,
+// JSON-NEXT:              "line": 67,
+// JSON-NEXT:              "col": 3,
+// JSON-NEXT:              "tokLen": 4
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 2138,
+// JSON-NEXT:              "col": 11,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "<dependent type>"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "UnresolvedLookupExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 2130,
+// JSON-NEXT:                "col": 3,
+// JSON-NEXT:                "tokLen": 4
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 2130,
+// JSON-NEXT:                "col": 3,
+// JSON-NEXT:                "tokLen": 4
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "<overloaded function type>"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "lvalue",
+// JSON-NEXT:              "usesADL": true,
+// JSON-NEXT:              "name": "func",
+// JSON-NEXT:              "lookups": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "FunctionDecl",
+// JSON-NEXT:                "name": "func",
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "void (float)"
+// JSON-NEXT:                }
+// JSON-NEXT:               },
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "FunctionDecl",
+// JSON-NEXT:                "name": "func",
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "void (int)"
+// JSON-NEXT:                }
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             },
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "CXXUnresolvedConstructExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 2135,
+// JSON-NEXT:                "col": 8,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 2137,
+// JSON-NEXT:                "col": 10,
+// JSON-NEXT:                "tokLen": 1
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "T"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "prvalue"
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "NamespaceDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 2253,
+// JSON-NEXT:     "line": 73,
+// JSON-NEXT:     "col": 11,
+// JSON-NEXT:     "tokLen": 5
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 2243,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 9
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 2387,
+// JSON-NEXT:      "line": 77,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "test3",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2291,
+// JSON-NEXT:       "line": 74,
+// JSON-NEXT:       "col": 31,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2263,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2294,
+// JSON-NEXT:        "col": 34,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "A",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2281,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2272,
+// JSON-NEXT:          "col": 12,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2281,
+// JSON-NEXT:          "col": 21,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "T",
+// JSON-NEXT:        "tagUsed": "typename",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2291,
+// JSON-NEXT:         "col": 31,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2284,
+// JSON-NEXT:          "col": 24,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2294,
+// JSON-NEXT:          "col": 34,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "A",
+// JSON-NEXT:        "tagUsed": "struct",
+// JSON-NEXT:        "completeDefinition": true,
+// JSON-NEXT:        "definitionData": {
+// JSON-NEXT:         "canConstDefaultInit": true,
+// JSON-NEXT:         "copyAssign": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "copyCtor": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "defaultCtor": {
+// JSON-NEXT:          "defaultedIsConstexpr": true,
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "isConstexpr": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "dtor": {
+// JSON-NEXT:          "irrelevant": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:         "isAggregate": true,
+// JSON-NEXT:         "isEmpty": true,
+// JSON-NEXT:         "isLiteral": true,
+// JSON-NEXT:         "isPOD": true,
+// JSON-NEXT:         "isStandardLayout": true,
+// JSON-NEXT:         "isTrivial": true,
+// JSON-NEXT:         "isTriviallyCopyable": true,
+// JSON-NEXT:         "moveAssign": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "moveCtor": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXRecordDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2291,
+// JSON-NEXT:           "col": 31,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2284,
+// JSON-NEXT:            "col": 24,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2291,
+// JSON-NEXT:            "col": 31,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isImplicit": true,
+// JSON-NEXT:          "name": "A",
+// JSON-NEXT:          "tagUsed": "struct"
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2291,
+// JSON-NEXT:         "col": 31,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2263,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2294,
+// JSON-NEXT:          "col": 34,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "A",
+// JSON-NEXT:        "tagUsed": "struct",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "kind": "TemplateArgument",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "int"
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "BuiltinType",
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "int"
+// JSON-NEXT:            }
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2291,
+// JSON-NEXT:       "col": 31,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2263,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2291,
+// JSON-NEXT:        "col": 31,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isImplicit": true,
+// JSON-NEXT:      "name": "<deduction guide for A>",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2281,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2272,
+// JSON-NEXT:          "col": 12,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2281,
+// JSON-NEXT:          "col": 21,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "T",
+// JSON-NEXT:        "tagUsed": "typename",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXDeductionGuideDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2291,
+// JSON-NEXT:         "col": 31,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2291,
+// JSON-NEXT:          "col": 31,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2291,
+// JSON-NEXT:          "col": 31,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "<deduction guide for A>",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "auto () -> A<T>"
+// JSON-NEXT:        }
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2291,
+// JSON-NEXT:       "col": 31,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2263,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2291,
+// JSON-NEXT:        "col": 31,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "isImplicit": true,
+// JSON-NEXT:      "name": "<deduction guide for A>",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2281,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2272,
+// JSON-NEXT:          "col": 12,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2281,
+// JSON-NEXT:          "col": 21,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "T",
+// JSON-NEXT:        "tagUsed": "typename",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXDeductionGuideDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2291,
+// JSON-NEXT:         "col": 31,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2291,
+// JSON-NEXT:          "col": 31,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2291,
+// JSON-NEXT:          "col": 31,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "<deduction guide for A>",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "auto (A<T>) -> A<T>"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ParmVarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2291,
+// JSON-NEXT:           "col": 31,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2291,
+// JSON-NEXT:            "col": 31,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2291,
+// JSON-NEXT:            "col": 31,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "A<T>"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2320,
+// JSON-NEXT:       "line": 75,
+// JSON-NEXT:       "col": 24,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2299,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2333,
+// JSON-NEXT:        "col": 37,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "<deduction guide for A>",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2317,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2308,
+// JSON-NEXT:          "col": 12,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2317,
+// JSON-NEXT:          "col": 21,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isReferenced": true,
+// JSON-NEXT:        "name": "T",
+// JSON-NEXT:        "tagUsed": "typename",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXDeductionGuideDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2320,
+// JSON-NEXT:         "col": 24,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2320,
+// JSON-NEXT:          "col": 24,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2333,
+// JSON-NEXT:          "col": 37,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "<deduction guide for A>",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "auto (T) -> A<int>"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "ParmVarDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2323,
+// JSON-NEXT:           "col": 27,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2322,
+// JSON-NEXT:            "col": 26,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2322,
+// JSON-NEXT:            "col": 26,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "T"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "NamespaceDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 2400,
+// JSON-NEXT:     "line": 79,
+// JSON-NEXT:     "col": 11,
+// JSON-NEXT:     "tokLen": 5
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 2390,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 9
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 3297,
+// JSON-NEXT:      "line": 103,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "test4",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2445,
+// JSON-NEXT:       "line": 81,
+// JSON-NEXT:       "col": 8,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2408,
+// JSON-NEXT:        "line": 80,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2471,
+// JSON-NEXT:        "line": 83,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "foo",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2427,
+// JSON-NEXT:         "line": 80,
+// JSON-NEXT:         "col": 20,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2418,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2427,
+// JSON-NEXT:          "col": 20,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "X",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "unsigned int"
+// JSON-NEXT:        },
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2435,
+// JSON-NEXT:         "col": 28,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2430,
+// JSON-NEXT:          "col": 23,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2435,
+// JSON-NEXT:          "col": 28,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "A",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "auto"
+// JSON-NEXT:        },
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 1
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2445,
+// JSON-NEXT:         "line": 81,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2438,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2471,
+// JSON-NEXT:          "line": 83,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "tagUsed": "struct",
+// JSON-NEXT:        "completeDefinition": true,
+// JSON-NEXT:        "definitionData": {
+// JSON-NEXT:         "canConstDefaultInit": true,
+// JSON-NEXT:         "copyAssign": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "copyCtor": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "defaultCtor": {
+// JSON-NEXT:          "defaultedIsConstexpr": true,
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "isConstexpr": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "dtor": {
+// JSON-NEXT:          "irrelevant": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:         "isAggregate": true,
+// JSON-NEXT:         "isEmpty": true,
+// JSON-NEXT:         "isLiteral": true,
+// JSON-NEXT:         "isPOD": true,
+// JSON-NEXT:         "isStandardLayout": true,
+// JSON-NEXT:         "isTrivial": true,
+// JSON-NEXT:         "isTriviallyCopyable": true,
+// JSON-NEXT:         "moveAssign": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "moveCtor": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXRecordDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2445,
+// JSON-NEXT:           "line": 81,
+// JSON-NEXT:           "col": 8,
+// JSON-NEXT:           "tokLen": 3
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2438,
+// JSON-NEXT:            "col": 1,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2445,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isImplicit": true,
+// JSON-NEXT:          "name": "foo",
+// JSON-NEXT:          "tagUsed": "struct"
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXMethodDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2465,
+// JSON-NEXT:           "line": 82,
+// JSON-NEXT:           "col": 15,
+// JSON-NEXT:           "tokLen": 2
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2453,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2468,
+// JSON-NEXT:            "col": 18,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "name": "fn",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "void ()"
+// JSON-NEXT:          },
+// JSON-NEXT:          "storageClass": "static"
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2445,
+// JSON-NEXT:         "line": 81,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2408,
+// JSON-NEXT:          "line": 80,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2471,
+// JSON-NEXT:          "line": 83,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "tagUsed": "struct",
+// JSON-NEXT:        "completeDefinition": true,
+// JSON-NEXT:        "definitionData": {
+// JSON-NEXT:         "canConstDefaultInit": true,
+// JSON-NEXT:         "canPassInRegisters": true,
+// JSON-NEXT:         "copyAssign": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "copyCtor": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "defaultCtor": {
+// JSON-NEXT:          "defaultedIsConstexpr": true,
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "isConstexpr": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "dtor": {
+// JSON-NEXT:          "irrelevant": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:         "isAggregate": true,
+// JSON-NEXT:         "isEmpty": true,
+// JSON-NEXT:         "isLiteral": true,
+// JSON-NEXT:         "isPOD": true,
+// JSON-NEXT:         "isStandardLayout": true,
+// JSON-NEXT:         "isTrivial": true,
+// JSON-NEXT:         "isTriviallyCopyable": true,
+// JSON-NEXT:         "moveAssign": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "moveCtor": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "kind": "TemplateArgument",
+// JSON-NEXT:          "value": 0
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "kind": "TemplateArgument",
+// JSON-NEXT:          "value": 0
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXRecordDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2445,
+// JSON-NEXT:           "line": 81,
+// JSON-NEXT:           "col": 8,
+// JSON-NEXT:           "tokLen": 3
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2438,
+// JSON-NEXT:            "col": 1,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2445,
+// JSON-NEXT:            "col": 8,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isImplicit": true,
+// JSON-NEXT:          "name": "foo",
+// JSON-NEXT:          "tagUsed": "struct"
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXMethodDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 2465,
+// JSON-NEXT:           "line": 82,
+// JSON-NEXT:           "col": 15,
+// JSON-NEXT:           "tokLen": 2
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2453,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2468,
+// JSON-NEXT:            "col": 18,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isUsed": true,
+// JSON-NEXT:          "name": "fn",
+// JSON-NEXT:          "mangledName": "_ZN5test43fooILj0ELl0EE2fnEv",
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "void ()"
+// JSON-NEXT:          },
+// JSON-NEXT:          "storageClass": "static"
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:        "name": "foo"
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 2846,
+// JSON-NEXT:       "line": 92,
+// JSON-NEXT:       "col": 6,
+// JSON-NEXT:       "tokLen": 4
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 2841,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 2879,
+// JSON-NEXT:        "line": 94,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "test",
+// JSON-NEXT:      "mangledName": "_ZN5test44testEv",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "void ()"
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CompoundStmt",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2853,
+// JSON-NEXT:          "line": 92,
+// JSON-NEXT:          "col": 13,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2879,
+// JSON-NEXT:          "line": 94,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CallExpr",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 2857,
+// JSON-NEXT:            "line": 93,
+// JSON-NEXT:            "col": 3,
+// JSON-NEXT:            "tokLen": 3
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 2876,
+// JSON-NEXT:            "col": 22,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "void"
+// JSON-NEXT:          },
+// JSON-NEXT:          "valueCategory": "prvalue",
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "ImplicitCastExpr",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 2857,
+// JSON-NEXT:              "col": 3,
+// JSON-NEXT:              "tokLen": 3
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 2873,
+// JSON-NEXT:              "col": 19,
+// JSON-NEXT:              "tokLen": 2
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "void (*)()"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "castKind": "FunctionToPointerDecay",
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "DeclRefExpr",
+// JSON-NEXT:              "range": {
+// JSON-NEXT:               "begin": {
+// JSON-NEXT:                "offset": 2857,
+// JSON-NEXT:                "col": 3,
+// JSON-NEXT:                "tokLen": 3
+// JSON-NEXT:               },
+// JSON-NEXT:               "end": {
+// JSON-NEXT:                "offset": 2873,
+// JSON-NEXT:                "col": 19,
+// JSON-NEXT:                "tokLen": 2
+// JSON-NEXT:               }
+// JSON-NEXT:              },
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "void ()"
+// JSON-NEXT:              },
+// JSON-NEXT:              "valueCategory": "lvalue",
+// JSON-NEXT:              "referencedDecl": {
+// JSON-NEXT:               "id": "0x{{.*}}",
+// JSON-NEXT:               "kind": "CXXMethodDecl",
+// JSON-NEXT:               "name": "fn",
+// JSON-NEXT:               "type": {
+// JSON-NEXT:                "qualType": "void ()"
+// JSON-NEXT:               }
+// JSON-NEXT:              }
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3281,
+// JSON-NEXT:       "line": 102,
+// JSON-NEXT:       "col": 17,
+// JSON-NEXT:       "tokLen": 3
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3265,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3294,
+// JSON-NEXT:        "col": 30,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "foo",
+// JSON-NEXT:      "tagUsed": "struct",
+// JSON-NEXT:      "completeDefinition": true,
+// JSON-NEXT:      "definitionData": {
+// JSON-NEXT:       "canConstDefaultInit": true,
+// JSON-NEXT:       "canPassInRegisters": true,
+// JSON-NEXT:       "copyAssign": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "copyCtor": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "defaultCtor": {
+// JSON-NEXT:        "defaultedIsConstexpr": true,
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "isConstexpr": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "dtor": {
+// JSON-NEXT:        "irrelevant": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:       "isAggregate": true,
+// JSON-NEXT:       "isEmpty": true,
+// JSON-NEXT:       "isLiteral": true,
+// JSON-NEXT:       "isPOD": true,
+// JSON-NEXT:       "isStandardLayout": true,
+// JSON-NEXT:       "isTrivial": true,
+// JSON-NEXT:       "isTriviallyCopyable": true,
+// JSON-NEXT:       "moveAssign": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "moveCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 1
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument",
+// JSON-NEXT:        "value": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2445,
+// JSON-NEXT:         "line": 81,
+// JSON-NEXT:         "col": 8,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2438,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2445,
+// JSON-NEXT:          "col": 8,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "foo",
+// JSON-NEXT:        "tagUsed": "struct"
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXMethodDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 2465,
+// JSON-NEXT:         "line": 82,
+// JSON-NEXT:         "col": 15,
+// JSON-NEXT:         "tokLen": 2
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 2453,
+// JSON-NEXT:          "col": 3,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 2468,
+// JSON-NEXT:          "col": 18,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "fn",
+// JSON-NEXT:        "mangledName": "_ZN5test43fooILj1ELl0EE2fnEv",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "storageClass": "static"
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "NamespaceDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 3310,
+// JSON-NEXT:     "line": 105,
+// JSON-NEXT:     "col": 11,
+// JSON-NEXT:     "tokLen": 5
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 3300,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 9
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 3632,
+// JSON-NEXT:      "line": 114,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "test5",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3338,
+// JSON-NEXT:       "line": 106,
+// JSON-NEXT:       "col": 21,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3318,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3343,
+// JSON-NEXT:        "col": 26,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "f",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3331,
+// JSON-NEXT:         "col": 14,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3327,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3327,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "long"
+// JSON-NEXT:        },
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FunctionDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3338,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3333,
+// JSON-NEXT:          "col": 16,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3343,
+// JSON-NEXT:          "col": 26,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "f",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3342,
+// JSON-NEXT:            "col": 25,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3343,
+// JSON-NEXT:            "col": 26,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FunctionDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3338,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3333,
+// JSON-NEXT:          "col": 16,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3343,
+// JSON-NEXT:          "col": 26,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isUsed": true,
+// JSON-NEXT:        "name": "f",
+// JSON-NEXT:        "mangledName": "_ZN5test51fILl0EEEvv",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "kind": "TemplateArgument",
+// JSON-NEXT:          "value": 0
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3342,
+// JSON-NEXT:            "col": 25,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3343,
+// JSON-NEXT:            "col": 26,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "VarDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3352,
+// JSON-NEXT:       "line": 107,
+// JSON-NEXT:       "col": 8,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3345,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3362,
+// JSON-NEXT:        "col": 18,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "p",
+// JSON-NEXT:      "mangledName": "_ZN5test51pE",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "void (*)()"
+// JSON-NEXT:      },
+// JSON-NEXT:      "init": "c",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ImplicitCastExpr",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3359,
+// JSON-NEXT:          "col": 15,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3362,
+// JSON-NEXT:          "col": 18,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void (*)()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "valueCategory": "prvalue",
+// JSON-NEXT:        "castKind": "FunctionToPointerDecay",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "DeclRefExpr",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3359,
+// JSON-NEXT:            "col": 15,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3362,
+// JSON-NEXT:            "col": 18,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "void ()"
+// JSON-NEXT:          },
+// JSON-NEXT:          "valueCategory": "lvalue",
+// JSON-NEXT:          "referencedDecl": {
+// JSON-NEXT:           "id": "0x{{.*}}",
+// JSON-NEXT:           "kind": "FunctionDecl",
+// JSON-NEXT:           "name": "f",
+// JSON-NEXT:           "type": {
+// JSON-NEXT:            "qualType": "void ()"
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "foundReferencedDecl": {
+// JSON-NEXT:           "id": "0x{{.*}}",
+// JSON-NEXT:           "kind": "FunctionTemplateDecl",
+// JSON-NEXT:           "name": "f"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3393,
+// JSON-NEXT:       "line": 108,
+// JSON-NEXT:       "col": 29,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3365,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3398,
+// JSON-NEXT:        "col": 34,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "f",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "NonTypeTemplateParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3383,
+// JSON-NEXT:         "col": 19,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3374,
+// JSON-NEXT:          "col": 10,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3385,
+// JSON-NEXT:          "col": 21,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "unsigned int"
+// JSON-NEXT:        },
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0,
+// JSON-NEXT:        "defaultArg": {
+// JSON-NEXT:         "kind": "TemplateArgument",
+// JSON-NEXT:         "isExpr": true
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "kind": "TemplateArgument",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3385,
+// JSON-NEXT:            "col": 21,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3385,
+// JSON-NEXT:            "col": 21,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isExpr": true,
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "IntegerLiteral",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 3385,
+// JSON-NEXT:              "col": 21,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 3385,
+// JSON-NEXT:              "col": 21,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "int"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "prvalue",
+// JSON-NEXT:            "value": "0"
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FunctionDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3393,
+// JSON-NEXT:         "col": 29,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3388,
+// JSON-NEXT:          "col": 24,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3398,
+// JSON-NEXT:          "col": 34,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "f",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3397,
+// JSON-NEXT:            "col": 33,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3398,
+// JSON-NEXT:            "col": 34,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FunctionDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3393,
+// JSON-NEXT:         "col": 29,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3388,
+// JSON-NEXT:          "col": 24,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3398,
+// JSON-NEXT:          "col": 34,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isUsed": true,
+// JSON-NEXT:        "name": "f",
+// JSON-NEXT:        "mangledName": "_ZN5test51fILj0EEEvv",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "kind": "TemplateArgument",
+// JSON-NEXT:          "value": 0
+// JSON-NEXT:         },
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3397,
+// JSON-NEXT:            "col": 33,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3398,
+// JSON-NEXT:            "col": 34,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "VarDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3407,
+// JSON-NEXT:       "line": 109,
+// JSON-NEXT:       "col": 8,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3400,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3416,
+// JSON-NEXT:        "col": 17,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "q",
+// JSON-NEXT:      "mangledName": "_ZN5test51qE",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "void (*)()"
+// JSON-NEXT:      },
+// JSON-NEXT:      "init": "c",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ImplicitCastExpr",
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3414,
+// JSON-NEXT:          "col": 15,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3416,
+// JSON-NEXT:          "col": 17,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void (*)()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "valueCategory": "prvalue",
+// JSON-NEXT:        "castKind": "FunctionToPointerDecay",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "DeclRefExpr",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3414,
+// JSON-NEXT:            "col": 15,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3416,
+// JSON-NEXT:            "col": 17,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "void ()"
+// JSON-NEXT:          },
+// JSON-NEXT:          "valueCategory": "lvalue",
+// JSON-NEXT:          "referencedDecl": {
+// JSON-NEXT:           "id": "0x{{.*}}",
+// JSON-NEXT:           "kind": "FunctionDecl",
+// JSON-NEXT:           "name": "f",
+// JSON-NEXT:           "type": {
+// JSON-NEXT:            "qualType": "void ()"
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "foundReferencedDecl": {
+// JSON-NEXT:           "id": "0x{{.*}}",
+// JSON-NEXT:           "kind": "FunctionTemplateDecl",
+// JSON-NEXT:           "name": "f"
+// JSON-NEXT:          }
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "NamespaceDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 3645,
+// JSON-NEXT:     "line": 116,
+// JSON-NEXT:     "col": 11,
+// JSON-NEXT:     "tokLen": 5
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 3635,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 9
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 4000,
+// JSON-NEXT:      "line": 128,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "test6",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "VarTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3687,
+// JSON-NEXT:       "line": 118,
+// JSON-NEXT:       "col": 16,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3653,
+// JSON-NEXT:        "line": 117,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3691,
+// JSON-NEXT:        "line": 118,
+// JSON-NEXT:        "col": 20,
+// JSON-NEXT:        "tokLen": 4
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "C",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3669,
+// JSON-NEXT:         "line": 117,
+// JSON-NEXT:         "col": 17,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3663,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3669,
+// JSON-NEXT:          "col": 17,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "D",
+// JSON-NEXT:        "tagUsed": "class",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "VarDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3687,
+// JSON-NEXT:         "line": 118,
+// JSON-NEXT:         "col": 16,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3672,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 9
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3691,
+// JSON-NEXT:          "col": 20,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "C",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "const bool"
+// JSON-NEXT:        },
+// JSON-NEXT:        "constexpr": true,
+// JSON-NEXT:        "init": "c",
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXBoolLiteralExpr",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3691,
+// JSON-NEXT:            "col": 20,
+// JSON-NEXT:            "tokLen": 4
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3691,
+// JSON-NEXT:            "col": 20,
+// JSON-NEXT:            "tokLen": 4
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "type": {
+// JSON-NEXT:           "qualType": "bool"
+// JSON-NEXT:          },
+// JSON-NEXT:          "valueCategory": "prvalue",
+// JSON-NEXT:          "value": true
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "FunctionTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 3724,
+// JSON-NEXT:       "line": 121,
+// JSON-NEXT:       "col": 6,
+// JSON-NEXT:       "tokLen": 4
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 3698,
+// JSON-NEXT:        "line": 120,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 3998,
+// JSON-NEXT:        "line": 127,
+// JSON-NEXT:        "col": 1,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "func",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3714,
+// JSON-NEXT:         "line": 120,
+// JSON-NEXT:         "col": 17,
+// JSON-NEXT:         "tokLen": 3
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3708,
+// JSON-NEXT:          "col": 11,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3714,
+// JSON-NEXT:          "col": 17,
+// JSON-NEXT:          "tokLen": 3
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isReferenced": true,
+// JSON-NEXT:        "name": "Key",
+// JSON-NEXT:        "tagUsed": "class",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "FunctionDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 3724,
+// JSON-NEXT:         "line": 121,
+// JSON-NEXT:         "col": 6,
+// JSON-NEXT:         "tokLen": 4
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 3719,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 4
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 3998,
+// JSON-NEXT:          "line": 127,
+// JSON-NEXT:          "col": 1,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "func",
+// JSON-NEXT:        "type": {
+// JSON-NEXT:         "qualType": "void ()"
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CompoundStmt",
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 3731,
+// JSON-NEXT:            "line": 121,
+// JSON-NEXT:            "col": 13,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 3998,
+// JSON-NEXT:            "line": 127,
+// JSON-NEXT:            "col": 1,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "inner": [
+// JSON-NEXT:           {
+// JSON-NEXT:            "id": "0x{{.*}}",
+// JSON-NEXT:            "kind": "UnresolvedLookupExpr",
+// JSON-NEXT:            "range": {
+// JSON-NEXT:             "begin": {
+// JSON-NEXT:              "offset": 3735,
+// JSON-NEXT:              "line": 122,
+// JSON-NEXT:              "col": 3,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             },
+// JSON-NEXT:             "end": {
+// JSON-NEXT:              "offset": 3740,
+// JSON-NEXT:              "col": 8,
+// JSON-NEXT:              "tokLen": 1
+// JSON-NEXT:             }
+// JSON-NEXT:            },
+// JSON-NEXT:            "type": {
+// JSON-NEXT:             "qualType": "<dependent type>"
+// JSON-NEXT:            },
+// JSON-NEXT:            "valueCategory": "lvalue",
+// JSON-NEXT:            "usesADL": false,
+// JSON-NEXT:            "name": "C",
+// JSON-NEXT:            "lookups": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "id": "0x{{.*}}",
+// JSON-NEXT:              "kind": "VarTemplateDecl",
+// JSON-NEXT:              "name": "C"
+// JSON-NEXT:             }
+// JSON-NEXT:            ],
+// JSON-NEXT:            "inner": [
+// JSON-NEXT:             {
+// JSON-NEXT:              "kind": "TemplateArgument",
+// JSON-NEXT:              "type": {
+// JSON-NEXT:               "qualType": "Key"
+// JSON-NEXT:              },
+// JSON-NEXT:              "inner": [
+// JSON-NEXT:               {
+// JSON-NEXT:                "id": "0x{{.*}}",
+// JSON-NEXT:                "kind": "TemplateTypeParmType",
+// JSON-NEXT:                "type": {
+// JSON-NEXT:                 "qualType": "Key"
+// JSON-NEXT:                },
+// JSON-NEXT:                "isDependent": true,
+// JSON-NEXT:                "isInstantiationDependent": true,
+// JSON-NEXT:                "depth": 0,
+// JSON-NEXT:                "index": 0,
+// JSON-NEXT:                "decl": {
+// JSON-NEXT:                 "id": "0x{{.*}}",
+// JSON-NEXT:                 "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:                 "name": "Key"
+// JSON-NEXT:                }
+// JSON-NEXT:               }
+// JSON-NEXT:              ]
+// JSON-NEXT:             }
+// JSON-NEXT:            ]
+// JSON-NEXT:           }
+// JSON-NEXT:          ]
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "NamespaceDecl",
+// JSON-NEXT:    "loc": {
+// JSON-NEXT:     "offset": 4013,
+// JSON-NEXT:     "line": 130,
+// JSON-NEXT:     "col": 11,
+// JSON-NEXT:     "tokLen": 5
+// JSON-NEXT:    },
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {
+// JSON-NEXT:      "offset": 4003,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 9
+// JSON-NEXT:     },
+// JSON-NEXT:     "end": {
+// JSON-NEXT:      "offset": 4308,
+// JSON-NEXT:      "line": 136,
+// JSON-NEXT:      "col": 1,
+// JSON-NEXT:      "tokLen": 1
+// JSON-NEXT:     }
+// JSON-NEXT:    },
+// JSON-NEXT:    "name": "test7",
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 4066,
+// JSON-NEXT:       "line": 131,
+// JSON-NEXT:       "col": 46,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 4023,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 4069,
+// JSON-NEXT:        "col": 49,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "A",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTemplateParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 4055,
+// JSON-NEXT:         "col": 35,
+// JSON-NEXT:         "tokLen": 2
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 4033,
+// JSON-NEXT:          "col": 13,
+// JSON-NEXT:          "tokLen": 8
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 4055,
+// JSON-NEXT:          "col": 35,
+// JSON-NEXT:          "tokLen": 2
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "TT",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0,
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 4047,
+// JSON-NEXT:           "col": 27,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 4042,
+// JSON-NEXT:            "col": 22,
+// JSON-NEXT:            "tokLen": 5
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 4042,
+// JSON-NEXT:            "col": 22,
+// JSON-NEXT:            "tokLen": 5
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "tagUsed": "class",
+// JSON-NEXT:          "depth": 1,
+// JSON-NEXT:          "index": 0
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 4066,
+// JSON-NEXT:         "col": 46,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 4059,
+// JSON-NEXT:          "col": 39,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 4069,
+// JSON-NEXT:          "col": 49,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "A",
+// JSON-NEXT:        "tagUsed": "struct",
+// JSON-NEXT:        "completeDefinition": true,
+// JSON-NEXT:        "definitionData": {
+// JSON-NEXT:         "canConstDefaultInit": true,
+// JSON-NEXT:         "copyAssign": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "copyCtor": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "defaultCtor": {
+// JSON-NEXT:          "defaultedIsConstexpr": true,
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "isConstexpr": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "dtor": {
+// JSON-NEXT:          "irrelevant": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:         "isAggregate": true,
+// JSON-NEXT:         "isEmpty": true,
+// JSON-NEXT:         "isLiteral": true,
+// JSON-NEXT:         "isPOD": true,
+// JSON-NEXT:         "isStandardLayout": true,
+// JSON-NEXT:         "isTrivial": true,
+// JSON-NEXT:         "isTriviallyCopyable": true,
+// JSON-NEXT:         "moveAssign": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "moveCtor": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXRecordDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 4066,
+// JSON-NEXT:           "col": 46,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 4059,
+// JSON-NEXT:            "col": 39,
+// JSON-NEXT:            "tokLen": 6
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 4066,
+// JSON-NEXT:            "col": 46,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isImplicit": true,
+// JSON-NEXT:          "name": "A",
+// JSON-NEXT:          "tagUsed": "struct"
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:        "name": "A"
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 4100,
+// JSON-NEXT:       "line": 132,
+// JSON-NEXT:       "col": 29,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 4074,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 4103,
+// JSON-NEXT:        "col": 32,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "B",
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "TemplateTypeParmDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 4092,
+// JSON-NEXT:         "col": 21,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 4084,
+// JSON-NEXT:          "col": 13,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 4084,
+// JSON-NEXT:          "col": 13,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "tagUsed": "class",
+// JSON-NEXT:        "depth": 0,
+// JSON-NEXT:        "index": 0,
+// JSON-NEXT:        "isParameterPack": true
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 4100,
+// JSON-NEXT:         "col": 29,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 4094,
+// JSON-NEXT:          "col": 23,
+// JSON-NEXT:          "tokLen": 5
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 4103,
+// JSON-NEXT:          "col": 32,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "name": "B",
+// JSON-NEXT:        "tagUsed": "class",
+// JSON-NEXT:        "completeDefinition": true,
+// JSON-NEXT:        "definitionData": {
+// JSON-NEXT:         "canConstDefaultInit": true,
+// JSON-NEXT:         "copyAssign": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "copyCtor": {
+// JSON-NEXT:          "hasConstParam": true,
+// JSON-NEXT:          "implicitHasConstParam": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "defaultCtor": {
+// JSON-NEXT:          "defaultedIsConstexpr": true,
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "isConstexpr": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "dtor": {
+// JSON-NEXT:          "irrelevant": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:         "isAggregate": true,
+// JSON-NEXT:         "isEmpty": true,
+// JSON-NEXT:         "isLiteral": true,
+// JSON-NEXT:         "isPOD": true,
+// JSON-NEXT:         "isStandardLayout": true,
+// JSON-NEXT:         "isTrivial": true,
+// JSON-NEXT:         "isTriviallyCopyable": true,
+// JSON-NEXT:         "moveAssign": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         },
+// JSON-NEXT:         "moveCtor": {
+// JSON-NEXT:          "exists": true,
+// JSON-NEXT:          "needsImplicit": true,
+// JSON-NEXT:          "simple": true,
+// JSON-NEXT:          "trivial": true
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "inner": [
+// JSON-NEXT:         {
+// JSON-NEXT:          "id": "0x{{.*}}",
+// JSON-NEXT:          "kind": "CXXRecordDecl",
+// JSON-NEXT:          "loc": {
+// JSON-NEXT:           "offset": 4100,
+// JSON-NEXT:           "col": 29,
+// JSON-NEXT:           "tokLen": 1
+// JSON-NEXT:          },
+// JSON-NEXT:          "range": {
+// JSON-NEXT:           "begin": {
+// JSON-NEXT:            "offset": 4094,
+// JSON-NEXT:            "col": 23,
+// JSON-NEXT:            "tokLen": 5
+// JSON-NEXT:           },
+// JSON-NEXT:           "end": {
+// JSON-NEXT:            "offset": 4100,
+// JSON-NEXT:            "col": 29,
+// JSON-NEXT:            "tokLen": 1
+// JSON-NEXT:           }
+// JSON-NEXT:          },
+// JSON-NEXT:          "isImplicit": true,
+// JSON-NEXT:          "name": "B",
+// JSON-NEXT:          "tagUsed": "class"
+// JSON-NEXT:         }
+// JSON-NEXT:        ]
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     },
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT:      "loc": {
+// JSON-NEXT:       "offset": 4124,
+// JSON-NEXT:       "line": 133,
+// JSON-NEXT:       "col": 19,
+// JSON-NEXT:       "tokLen": 1
+// JSON-NEXT:      },
+// JSON-NEXT:      "range": {
+// JSON-NEXT:       "begin": {
+// JSON-NEXT:        "offset": 4108,
+// JSON-NEXT:        "col": 3,
+// JSON-NEXT:        "tokLen": 8
+// JSON-NEXT:       },
+// JSON-NEXT:       "end": {
+// JSON-NEXT:        "offset": 4127,
+// JSON-NEXT:        "col": 22,
+// JSON-NEXT:        "tokLen": 1
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "name": "A",
+// JSON-NEXT:      "tagUsed": "struct",
+// JSON-NEXT:      "completeDefinition": true,
+// JSON-NEXT:      "strict-pack-match": true,
+// JSON-NEXT:      "definitionData": {
+// JSON-NEXT:       "canConstDefaultInit": true,
+// JSON-NEXT:       "canPassInRegisters": true,
+// JSON-NEXT:       "copyAssign": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "copyCtor": {
+// JSON-NEXT:        "hasConstParam": true,
+// JSON-NEXT:        "implicitHasConstParam": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "defaultCtor": {
+// JSON-NEXT:        "defaultedIsConstexpr": true,
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "isConstexpr": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "dtor": {
+// JSON-NEXT:        "irrelevant": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "hasConstexprNonCopyMoveConstructor": true,
+// JSON-NEXT:       "isAggregate": true,
+// JSON-NEXT:       "isEmpty": true,
+// JSON-NEXT:       "isLiteral": true,
+// JSON-NEXT:       "isPOD": true,
+// JSON-NEXT:       "isStandardLayout": true,
+// JSON-NEXT:       "isTrivial": true,
+// JSON-NEXT:       "isTriviallyCopyable": true,
+// JSON-NEXT:       "moveAssign": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       },
+// JSON-NEXT:       "moveCtor": {
+// JSON-NEXT:        "exists": true,
+// JSON-NEXT:        "needsImplicit": true,
+// JSON-NEXT:        "simple": true,
+// JSON-NEXT:        "trivial": true
+// JSON-NEXT:       }
+// JSON-NEXT:      },
+// JSON-NEXT:      "inner": [
+// JSON-NEXT:       {
+// JSON-NEXT:        "kind": "TemplateArgument"
+// JSON-NEXT:       },
+// JSON-NEXT:       {
+// JSON-NEXT:        "id": "0x{{.*}}",
+// JSON-NEXT:        "kind": "CXXRecordDecl",
+// JSON-NEXT:        "loc": {
+// JSON-NEXT:         "offset": 4066,
+// JSON-NEXT:         "line": 131,
+// JSON-NEXT:         "col": 46,
+// JSON-NEXT:         "tokLen": 1
+// JSON-NEXT:        },
+// JSON-NEXT:        "range": {
+// JSON-NEXT:         "begin": {
+// JSON-NEXT:          "offset": 4059,
+// JSON-NEXT:          "col": 39,
+// JSON-NEXT:          "tokLen": 6
+// JSON-NEXT:         },
+// JSON-NEXT:         "end": {
+// JSON-NEXT:          "offset": 4066,
+// JSON-NEXT:          "col": 46,
+// JSON-NEXT:          "tokLen": 1
+// JSON-NEXT:         }
+// JSON-NEXT:        },
+// JSON-NEXT:        "isImplicit": true,
+// JSON-NEXT:        "name": "A",
+// JSON-NEXT:        "tagUsed": "struct"
+// JSON-NEXT:       }
+// JSON-NEXT:      ]
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   }
+// JSON-NEXT:  ]
+// JSON-NEXT: }
diff --git a/clang/test/AST/gen_ast_dump_json_test.py b/clang/test/AST/gen_ast_dump_json_test.py
index 301d60e479dbf..39b8eaadbad32 100644
--- a/clang/test/AST/gen_ast_dump_json_test.py
+++ b/clang/test/AST/gen_ast_dump_json_test.py
@@ -83,6 +83,12 @@ def main():
         action="store",
         default="",
     )
+    parser.add_argument(
+        "--prefix",
+        help="The FileCheck prefix",
+        action="store",
+        default="CHECK",
+    )
     update_or_generate_group = parser.add_mutually_exclusive_group()
     update_or_generate_group.add_argument(
         "--update", help="Update the file in-place", action="store_true"
@@ -113,11 +119,18 @@ def main():
             cmdline_opts=args.opts,
             do_update=args.update,
             force_update=args.update_manual,
+            prefix=args.prefix,
         )
 
 
 def process_file(
-    source_file, clang_binary, cmdline_filters, cmdline_opts, do_update, force_update
+    source_file,
+    clang_binary,
+    cmdline_filters,
+    cmdline_opts,
+    do_update,
+    force_update,
+    prefix,
 ):
     note_firstline = (
         "// NOTE: CHECK lines have been autogenerated by " "gen_ast_dump_json_test.py"
@@ -227,14 +240,14 @@ def process_file(
         for out_ast in out_asts:
             append_str = json.dumps(out_ast, indent=1, ensure_ascii=False)
             out_str = "\n\n"
-            out_str += "// CHECK-NOT: {{^}}Dumping\n"
+            out_str += f"// {prefix}-NOT: {{{{^}}}}Dumping\n"
             index = 0
             for append_line in append_str.splitlines()[2:]:
                 if index == 0:
-                    out_str += "// CHECK: %s\n" % (append_line.rstrip())
+                    out_str += f"// {prefix}: %s\n" % (append_line.rstrip())
                     index += 1
                 else:
-                    out_str += "// CHECK-NEXT: %s\n" % (append_line.rstrip())
+                    out_str += f"// {prefix}-NEXT: %s\n" % (append_line.rstrip())
 
             f.write(out_str)
         f.flush()
diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp
index dccb17c48d325..8592be469bb50 100644
--- a/clang/test/SemaTemplate/cwg2398.cpp
+++ b/clang/test/SemaTemplate/cwg2398.cpp
@@ -630,6 +630,26 @@ namespace regression2 {
   template <typename, int> struct Matrix;
   template struct D<Matrix<double, 3>>;
 } // namespace regression2
+namespace regression3 {
+  struct None {};
+  template<class T> struct Node { using type = T; };
+
+  template <template<class> class TT, class T>
+  // old-note@-1 {{previous template type parameter declared here}}
+  struct A {
+    static_assert(!__is_same(T, None));
+    using type2 = typename A<TT, typename T::type>::type2;
+  };
+
+  template <template<class> class TT> struct A<TT, None> {
+    using type2 = void;
+  };
+
+  template <class...> class B {};
+  // old-note@-1 {{template type parameter pack does not match template type parameter}}
+  template struct A<B, Node<None>>;
+  // old-error@-1 {{different template}}
+} // namespace regression3
 
 namespace nttp_auto {
   namespace t1 {
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp b/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp
index c201153fd7ceb..112f106fc3bc3 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp
@@ -280,7 +280,8 @@ std::optional<Decl *> CxxModuleHandler::tryInstantiateStdTemplate(Decl *d) {
       new_class_template->getDeclContext(),
       new_class_template->getTemplatedDecl()->getLocation(),
       new_class_template->getLocation(), new_class_template, imported_args,
-      nullptr);
+      td->hasMatchedPackOnParmToNonPackOnArg(),
+      /*PrevDecl=*/nullptr);
 
   new_class_template->AddSpecialization(result, InsertPos);
   if (new_class_template->isOutOfLine())

From d420bf8a299ad13ac7a11971ea1430fc0edebcea Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 19 Feb 2025 08:21:56 -0500
Subject: [PATCH 176/282] [libc++] Guard include of <features.h> with
 __has_include (#127691)

Some configurations define __AMDGPU__ or __NVPTX__ on platforms that
don't provide <features.h>, such as CUDA on Mac.

(cherry picked from commit 2c8b1248513624e89b510397224f0f405116f3d3)
---
 libcxx/include/__configuration/platform.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h
index cff99376ee24b..8d0f8f63f5213 100644
--- a/libcxx/include/__configuration/platform.h
+++ b/libcxx/include/__configuration/platform.h
@@ -32,12 +32,14 @@
 
 // Need to detect which libc we're using if we're on Linux.
 #if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__)
-#  include <features.h>
-#  if defined(__GLIBC_PREREQ)
-#    define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b)
-#  else
-#    define _LIBCPP_GLIBC_PREREQ(a, b) 0
-#  endif // defined(__GLIBC_PREREQ)
+#  if __has_include(<features.h>)
+#    include <features.h>
+#    if defined(__GLIBC_PREREQ)
+#      define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b)
+#    else
+#      define _LIBCPP_GLIBC_PREREQ(a, b) 0
+#    endif // defined(__GLIBC_PREREQ)
+#  endif
 #endif
 
 #ifndef __BYTE_ORDER__

From 1504fc57d88d5d700d5f8053ebc46b33e8bb12bf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 22 Feb 2025 00:35:52 +0700
Subject: [PATCH 177/282] AMDGPU: Stop emitting an error on illegal
 addrspacecasts (#127487) (#127751)

These cannot be static compile errors, and should be treated as
poison. Invalid casts may be introduced which are dynamically dead.

For example:

```
  void foo(volatile generic int* x) {
    __builtin_assume(is_shared(x));
    *x = 4;
  }

  void bar() {
    private int y;
    foo(&y); // violation, wrong address space
  }
```

This could produce a compile time backend error or not depending on
the optimization level. Similarly, the new test demonstrates a failure
on a lowered atomicrmw which required inserting runtime address
space checks. The invalid cases are dynamically dead, we should not
error, and the AtomicExpand pass shouldn't have to consider the details
of the incoming pointer to produce valid IR.

This should go to the release branch. This fixes broken -O0 compiles
with 64-bit atomics which would have started failing in
1d03708.

(cherry picked from commit 18ea6c9)
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   7 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   7 +-
 llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll  | 755 ++++++++++++++++++
 .../CodeGen/AMDGPU/invalid-addrspacecast.ll   |  44 +-
 4 files changed, 796 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e9e47eaadd557..e84f0f5fa615a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2426,11 +2426,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
     return true;
   }
 
-  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
-      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
-
-  LLVMContext &Ctx = MF.getFunction().getContext();
-  Ctx.diagnose(InvalidAddrSpaceCast);
+  // Invalid casts are poison.
+  // TODO: Should return poison
   B.buildUndef(Dst);
   MI.eraseFromParent();
   return true;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b632c50dae0e3..e09df53995d61 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7340,11 +7340,8 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
 
   // global <-> flat are no-ops and never emitted.
 
-  const MachineFunction &MF = DAG.getMachineFunction();
-  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
-      MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
-  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
-
+  // Invalid casts are poison.
+  // TODO: Should return poison
   return DAG.getUNDEF(Op->getValueType(0));
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index f5c9b1a79b476..9b446896db590 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -444,6 +444,761 @@ define float @no_unsafe(ptr %addr, float %val) {
   ret float %res
 }
 
+@global = hidden addrspace(1) global i64 0, align 8
+
+; Make sure there is no error on an invalid addrspacecast without optimizations
+define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
+; GFX908-LABEL: optnone_atomicrmw_add_i64_expand:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    s_mov_b32 s6, 32
+; GFX908-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX908-NEXT:    s_getpc_b64 s[6:7]
+; GFX908-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
+; GFX908-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX908-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX908-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX908-NEXT:    s_mov_b64 s[4:5], -1
+; GFX908-NEXT:    s_mov_b32 s6, 1
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[6:7], v2, s6
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX908-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX908-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX908-NEXT:  .LBB4_1: ; %Flow
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX908-NEXT:    s_mov_b32 s4, 1
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[4:5], v2, s4
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.private
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_add_co_u32_e64 v0, s[4:5], v3, v0
+; GFX908-NEXT:    v_addc_co_u32_e64 v1, s[4:5], v4, v1, s[4:5]
+; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX908-NEXT:    s_branch .LBB4_4
+; GFX908-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX908-NEXT:    s_getpc_b64 s[4:5]
+; GFX908-NEXT:    s_add_u32 s4, s4, global@rel32@lo+4
+; GFX908-NEXT:    s_addc_u32 s5, s5, global@rel32@hi+12
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    v_mov_b32_e32 v3, s5
+; GFX908-NEXT:    flat_atomic_add_x2 v[3:4], v[2:3], v[0:1] glc
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    s_branch .LBB4_1
+; GFX908-NEXT:  .LBB4_4: ; %atomicrmw.phi
+; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.end
+; GFX908-NEXT:    s_mov_b32 s4, 32
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b64 v[1:2], s4, v[3:4]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: optnone_atomicrmw_add_i64_expand:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    s_mov_b32 s6, 32
+; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX90A-NEXT:    s_getpc_b64 s[6:7]
+; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX90A-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX90A-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], -1
+; GFX90A-NEXT:    s_mov_b32 s6, 1
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[6:7], v2, s6
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX90A-NEXT:  .LBB4_1: ; %Flow
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX90A-NEXT:    s_mov_b32 s4, 1
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[4:5], v4, s4
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.private
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[4:5], v2, v0
+; GFX90A-NEXT:    v_addc_co_u32_e64 v1, s[4:5], v3, v1, s[4:5]
+; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    s_branch .LBB4_4
+; GFX90A-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX90A-NEXT:    s_getpc_b64 s[4:5]
+; GFX90A-NEXT:    s_add_u32 s4, s4, global@rel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s5, s5, global@rel32@hi+12
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    s_branch .LBB4_1
+; GFX90A-NEXT:  .LBB4_4: ; %atomicrmw.phi
+; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.end
+; GFX90A-NEXT:    s_mov_b32 s4, 32
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_lshrrev_b64 v[4:5], s4, v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: optnone_atomicrmw_add_i64_expand:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT:    s_mov_b32 s2, 32
+; GFX940-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX940-NEXT:    s_getpc_b64 s[2:3]
+; GFX940-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX940-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX940-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX940-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], -1
+; GFX940-NEXT:    s_mov_b32 s2, 1
+; GFX940-NEXT:    v_cmp_ne_u32_e64 s[2:3], v2, s2
+; GFX940-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX940-NEXT:  .LBB4_1: ; %Flow
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX940-NEXT:    s_mov_b32 s0, 1
+; GFX940-NEXT:    v_cmp_ne_u32_e64 s[0:1], v4, s0
+; GFX940-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.private
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX940-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 sc0 sc1
+; GFX940-NEXT:    s_branch .LBB4_4
+; GFX940-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX940-NEXT:    s_getpc_b64 s[0:1]
+; GFX940-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4
+; GFX940-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT:    flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_branch .LBB4_1
+; GFX940-NEXT:  .LBB4_4: ; %atomicrmw.phi
+; GFX940-NEXT:  ; %bb.5: ; %atomicrmw.end
+; GFX940-NEXT:    s_mov_b32 s0, 32
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_lshrrev_b64 v[4:5], s0, v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: optnone_atomicrmw_add_i64_expand:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX1100-NEXT:    s_mov_b32 s2, 32
+; GFX1100-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX1100-NEXT:    s_getpc_b64 s[2:3]
+; GFX1100-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX1100-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1100-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-NEXT:    s_mov_b32 s0, -1
+; GFX1100-NEXT:    s_mov_b32 s1, 1
+; GFX1100-NEXT:    v_cmp_ne_u32_e64 s1, v2, s1
+; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
+; GFX1100-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1100-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX1100-NEXT:  .LBB4_1: ; %Flow
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-NEXT:    s_mov_b32 s0, 1
+; GFX1100-NEXT:    v_cmp_ne_u32_e64 s0, v2, s0
+; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX1100-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.private
+; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-NEXT:    scratch_load_b64 v[3:4], off, s0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-NEXT:    v_add_co_u32 v0, s0, v3, v0
+; GFX1100-NEXT:    v_add_co_ci_u32_e64 v1, s0, v4, v1, s0
+; GFX1100-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX1100-NEXT:    s_branch .LBB4_4
+; GFX1100-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX1100-NEXT:    s_getpc_b64 s[0:1]
+; GFX1100-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12
+; GFX1100-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1100-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1100-NEXT:    flat_atomic_add_u64 v[3:4], v[2:3], v[0:1] glc
+; GFX1100-NEXT:    s_mov_b32 s0, 0
+; GFX1100-NEXT:    s_branch .LBB4_1
+; GFX1100-NEXT:  .LBB4_4: ; %atomicrmw.phi
+; GFX1100-NEXT:  ; %bb.5: ; %atomicrmw.end
+; GFX1100-NEXT:    s_mov_b32 s0, 32
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshrrev_b64 v[1:2], s0, v[3:4]
+; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1200-LABEL: optnone_atomicrmw_add_i64_expand:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_expcnt 0x0
+; GFX1200-NEXT:    s_wait_samplecnt 0x0
+; GFX1200-NEXT:    s_wait_bvhcnt 0x0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX1200-NEXT:    s_mov_b32 s2, 32
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX1200-NEXT:    s_getpc_b64 s[2:3]
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_sext_i32_i16 s3, s3
+; GFX1200-NEXT:    s_add_co_u32 s2, s2, global@rel32@lo+12
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_ci_u32 s3, s3, global@rel32@hi+24
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1200-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1200-NEXT:    s_mov_b32 s0, -1
+; GFX1200-NEXT:    s_mov_b32 s1, 1
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_cmp_ne_u32_e64 s1, v2, s1
+; GFX1200-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
+; GFX1200-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX1200-NEXT:  .LBB4_1: ; %Flow
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1200-NEXT:    s_mov_b32 s0, 1
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_cmp_ne_u32_e64 s0, v2, s0
+; GFX1200-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX1200-NEXT:  ; %bb.2: ; %atomicrmw.private
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    scratch_load_b64 v[3:4], off, s0
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_add_co_u32 v0, s0, v3, v0
+; GFX1200-NEXT:    v_add_co_ci_u32_e64 v1, s0, v4, v1, s0
+; GFX1200-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX1200-NEXT:    s_branch .LBB4_4
+; GFX1200-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX1200-NEXT:    s_getpc_b64 s[0:1]
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_sext_i32_i16 s1, s1
+; GFX1200-NEXT:    s_add_co_u32 s0, s0, global@rel32@lo+12
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_ci_u32 s1, s1, global@rel32@hi+24
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1200-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1200-NEXT:    flat_atomic_add_u64 v[3:4], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1200-NEXT:    s_mov_b32 s0, 0
+; GFX1200-NEXT:    s_branch .LBB4_1
+; GFX1200-NEXT:  .LBB4_4: ; %atomicrmw.phi
+; GFX1200-NEXT:  ; %bb.5: ; %atomicrmw.end
+; GFX1200-NEXT:    s_mov_b32 s0, 32
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_lshrrev_b64 v[1:2], s0, v[3:4]
+; GFX1200-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1200-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: optnone_atomicrmw_add_i64_expand:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    s_mov_b32 s2, 32
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX942-NEXT:    s_getpc_b64 s[2:3]
+; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX942-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[0:1], -1
+; GFX942-NEXT:    s_mov_b32 s2, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[2:3], v2, s2
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX942-NEXT:  .LBB4_1: ; %Flow
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX942-NEXT:    s_mov_b32 s0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], v4, s0
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.private
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX942-NEXT:    s_branch .LBB4_4
+; GFX942-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_branch .LBB4_1
+; GFX942-NEXT:  .LBB4_4: ; %atomicrmw.phi
+; GFX942-NEXT:  ; %bb.5: ; %atomicrmw.end
+; GFX942-NEXT:    s_mov_b32 s0, 32
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b64 v[4:5], s0, v[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %rmw = atomicrmw add ptr addrspacecast (ptr addrspace(1) @global to ptr), i64 %val syncscope("agent") monotonic, align 8
+  ret i64 %rmw
+}
+
+; Make sure there is no error on an invalid addrspacecast without optimizations
+define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
+; GFX908-LABEL: optnone_atomicrmw_fadd_f64_expand:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    s_mov_b32 s6, 32
+; GFX908-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX908-NEXT:    s_getpc_b64 s[6:7]
+; GFX908-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
+; GFX908-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX908-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX908-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX908-NEXT:    s_mov_b64 s[4:5], -1
+; GFX908-NEXT:    s_mov_b32 s6, 1
+; GFX908-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX908-NEXT:    s_cmp_lg_u32 s7, s6
+; GFX908-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX908-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX908-NEXT:    s_cbranch_vccnz .LBB5_2
+; GFX908-NEXT:    s_branch .LBB5_3
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.private
+; GFX908-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX908-NEXT:    s_branch .LBB5_6
+; GFX908-NEXT:  .LBB5_2: ; %atomicrmw.global
+; GFX908-NEXT:    s_getpc_b64 s[4:5]
+; GFX908-NEXT:    s_add_u32 s4, s4, global@rel32@lo+4
+; GFX908-NEXT:    s_addc_u32 s5, s5, global@rel32@hi+12
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    v_mov_b32_e32 v3, s5
+; GFX908-NEXT:    flat_load_dwordx2 v[3:4], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    s_branch .LBB5_4
+; GFX908-NEXT:  .LBB5_3: ; %Flow
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX908-NEXT:    s_branch .LBB5_6
+; GFX908-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_add_f64 v[3:4], v[5:6], v[0:1]
+; GFX908-NEXT:    s_getpc_b64 s[6:7]
+; GFX908-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
+; GFX908-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX908-NEXT:    v_mov_b32_e32 v8, s7
+; GFX908-NEXT:    v_mov_b32_e32 v7, s6
+; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[3:4], v[5:6]
+; GFX908-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.end1
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    s_branch .LBB5_3
+; GFX908-NEXT:  .LBB5_6: ; %atomicrmw.phi
+; GFX908-NEXT:  ; %bb.7: ; %atomicrmw.end
+; GFX908-NEXT:    s_mov_b32 s4, 32
+; GFX908-NEXT:    v_lshrrev_b64 v[1:2], s4, v[3:4]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    s_mov_b32 s6, 32
+; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX90A-NEXT:    s_getpc_b64 s[6:7]
+; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX90A-NEXT:    s_cmp_eq_u32 s7, s4
+; GFX90A-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], -1
+; GFX90A-NEXT:    s_mov_b32 s6, 1
+; GFX90A-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX90A-NEXT:    s_cmp_lg_u32 s7, s6
+; GFX90A-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_2
+; GFX90A-NEXT:    s_branch .LBB5_3
+; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.private
+; GFX90A-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NEXT:    s_branch .LBB5_6
+; GFX90A-NEXT:  .LBB5_2: ; %atomicrmw.global
+; GFX90A-NEXT:    s_getpc_b64 s[4:5]
+; GFX90A-NEXT:    s_add_u32 s4, s4, global@rel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s5, s5, global@rel32@hi+12
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    s_branch .LBB5_4
+; GFX90A-NEXT:  .LBB5_3: ; %Flow
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX90A-NEXT:    s_branch .LBB5_6
+; GFX90A-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX90A-NEXT:    s_getpc_b64 s[6:7]
+; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5]
+; GFX90A-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.end1
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    s_branch .LBB5_3
+; GFX90A-NEXT:  .LBB5_6: ; %atomicrmw.phi
+; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.end
+; GFX90A-NEXT:    s_mov_b32 s4, 32
+; GFX90A-NEXT:    v_lshrrev_b64 v[4:5], s4, v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: optnone_atomicrmw_fadd_f64_expand:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT:    s_mov_b32 s2, 32
+; GFX940-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX940-NEXT:    s_getpc_b64 s[2:3]
+; GFX940-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX940-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX940-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX940-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], -1
+; GFX940-NEXT:    s_mov_b32 s2, 1
+; GFX940-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX940-NEXT:    s_cmp_lg_u32 s3, s2
+; GFX940-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX940-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT:    s_cbranch_vccnz .LBB5_2
+; GFX940-NEXT:    s_branch .LBB5_3
+; GFX940-NEXT:  .LBB5_1: ; %atomicrmw.private
+; GFX940-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX940-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 sc0 sc1
+; GFX940-NEXT:    s_branch .LBB5_6
+; GFX940-NEXT:  .LBB5_2: ; %atomicrmw.global
+; GFX940-NEXT:    s_getpc_b64 s[0:1]
+; GFX940-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4
+; GFX940-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_branch .LBB5_4
+; GFX940-NEXT:  .LBB5_3: ; %Flow
+; GFX940-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX940-NEXT:    s_branch .LBB5_6
+; GFX940-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX940-NEXT:    s_getpc_b64 s[2:3]
+; GFX940-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX940-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX940-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5]
+; GFX940-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX940-NEXT:  ; %bb.5: ; %atomicrmw.end1
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_branch .LBB5_3
+; GFX940-NEXT:  .LBB5_6: ; %atomicrmw.phi
+; GFX940-NEXT:  ; %bb.7: ; %atomicrmw.end
+; GFX940-NEXT:    s_mov_b32 s0, 32
+; GFX940-NEXT:    v_lshrrev_b64 v[4:5], s0, v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: optnone_atomicrmw_fadd_f64_expand:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX1100-NEXT:    s_mov_b32 s2, 32
+; GFX1100-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX1100-NEXT:    s_getpc_b64 s[2:3]
+; GFX1100-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX1100-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1100-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-NEXT:    s_mov_b32 s0, -1
+; GFX1100-NEXT:    s_mov_b32 s1, 1
+; GFX1100-NEXT:    v_cmp_ne_u32_e64 s1, v2, s1
+; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
+; GFX1100-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1100-NEXT:    s_cbranch_vccnz .LBB5_2
+; GFX1100-NEXT:    s_branch .LBB5_3
+; GFX1100-NEXT:  .LBB5_1: ; %atomicrmw.private
+; GFX1100-NEXT:    scratch_load_b64 v[3:4], off, s0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX1100-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX1100-NEXT:    s_branch .LBB5_6
+; GFX1100-NEXT:  .LBB5_2: ; %atomicrmw.global
+; GFX1100-NEXT:    s_getpc_b64 s[0:1]
+; GFX1100-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12
+; GFX1100-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1100-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1100-NEXT:    flat_load_b64 v[3:4], v[2:3]
+; GFX1100-NEXT:    s_mov_b32 s0, 0
+; GFX1100-NEXT:    s_branch .LBB5_4
+; GFX1100-NEXT:  .LBB5_3: ; %Flow
+; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX1100-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX1100-NEXT:    s_branch .LBB5_6
+; GFX1100-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1100-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1100-NEXT:    v_add_f64 v[3:4], v[5:6], v[0:1]
+; GFX1100-NEXT:    s_getpc_b64 s[2:3]
+; GFX1100-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX1100-NEXT:    v_mov_b32_e32 v8, s3
+; GFX1100-NEXT:    v_mov_b32_e32 v7, s2
+; GFX1100-NEXT:    flat_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6] glc
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u64_e64 s1, v[3:4], v[5:6]
+; GFX1100-NEXT:    s_or_b32 s0, s1, s0
+; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1100-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1100-NEXT:  ; %bb.5: ; %atomicrmw.end1
+; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1100-NEXT:    s_mov_b32 s0, 0
+; GFX1100-NEXT:    s_branch .LBB5_3
+; GFX1100-NEXT:  .LBB5_6: ; %atomicrmw.phi
+; GFX1100-NEXT:  ; %bb.7: ; %atomicrmw.end
+; GFX1100-NEXT:    s_mov_b32 s0, 32
+; GFX1100-NEXT:    v_lshrrev_b64 v[1:2], s0, v[3:4]
+; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1200-LABEL: optnone_atomicrmw_fadd_f64_expand:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_expcnt 0x0
+; GFX1200-NEXT:    s_wait_samplecnt 0x0
+; GFX1200-NEXT:    s_wait_bvhcnt 0x0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX1200-NEXT:    s_mov_b32 s2, 32
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX1200-NEXT:    s_getpc_b64 s[2:3]
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_sext_i32_i16 s3, s3
+; GFX1200-NEXT:    s_add_co_u32 s2, s2, global@rel32@lo+12
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_ci_u32 s3, s3, global@rel32@hi+24
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX1200-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1200-NEXT:    s_mov_b32 s0, -1
+; GFX1200-NEXT:    s_mov_b32 s1, 1
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_cmp_ne_u32_e64 s1, v2, s1
+; GFX1200-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
+; GFX1200-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB5_2
+; GFX1200-NEXT:    s_branch .LBB5_3
+; GFX1200-NEXT:  .LBB5_1: ; %atomicrmw.private
+; GFX1200-NEXT:    scratch_load_b64 v[3:4], off, s0
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_add_f64_e64 v[0:1], v[3:4], v[0:1]
+; GFX1200-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX1200-NEXT:    s_branch .LBB5_6
+; GFX1200-NEXT:  .LBB5_2: ; %atomicrmw.global
+; GFX1200-NEXT:    s_getpc_b64 s[0:1]
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_sext_i32_i16 s1, s1
+; GFX1200-NEXT:    s_add_co_u32 s0, s0, global@rel32@lo+12
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_ci_u32 s1, s1, global@rel32@hi+24
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1200-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1200-NEXT:    flat_load_b64 v[3:4], v[2:3]
+; GFX1200-NEXT:    s_mov_b32 s0, 0
+; GFX1200-NEXT:    s_branch .LBB5_4
+; GFX1200-NEXT:  .LBB5_3: ; %Flow
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX1200-NEXT:    s_branch .LBB5_6
+; GFX1200-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1200-NEXT:    v_add_f64_e64 v[3:4], v[5:6], v[0:1]
+; GFX1200-NEXT:    s_getpc_b64 s[2:3]
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_sext_i32_i16 s3, s3
+; GFX1200-NEXT:    s_add_co_u32 s2, s2, global@rel32@lo+12
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_ci_u32 s3, s3, global@rel32@hi+24
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_mov_b32_e32 v8, s3
+; GFX1200-NEXT:    v_mov_b32_e32 v7, s2
+; GFX1200-NEXT:    flat_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    v_cmp_eq_u64_e64 s1, v[3:4], v[5:6]
+; GFX1200-NEXT:    s_or_b32 s0, s1, s0
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1200-NEXT:  ; %bb.5: ; %atomicrmw.end1
+; GFX1200-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT:    s_mov_b32 s0, 0
+; GFX1200-NEXT:    s_branch .LBB5_3
+; GFX1200-NEXT:  .LBB5_6: ; %atomicrmw.phi
+; GFX1200-NEXT:  ; %bb.7: ; %atomicrmw.end
+; GFX1200-NEXT:    s_mov_b32 s0, 32
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_lshrrev_b64 v[1:2], s0, v[3:4]
+; GFX1200-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1200-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX942-NEXT:    s_mov_b32 s2, 32
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX942-NEXT:    s_getpc_b64 s[2:3]
+; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX942-NEXT:    s_cmp_eq_u32 s3, s0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[0:1], -1
+; GFX942-NEXT:    s_mov_b32 s2, 1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_cmp_lg_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    s_cbranch_vccnz .LBB5_2
+; GFX942-NEXT:    s_branch .LBB5_3
+; GFX942-NEXT:  .LBB5_1: ; %atomicrmw.private
+; GFX942-NEXT:    scratch_load_dwordx2 v[2:3], off, s0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX942-NEXT:    s_branch .LBB5_6
+; GFX942-NEXT:  .LBB5_2: ; %atomicrmw.global
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_branch .LBB5_4
+; GFX942-NEXT:  .LBB5_3: ; %Flow
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX942-NEXT:    s_branch .LBB5_6
+; GFX942-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX942-NEXT:    s_getpc_b64 s[2:3]
+; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX942-NEXT:  ; %bb.5: ; %atomicrmw.end1
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_branch .LBB5_3
+; GFX942-NEXT:  .LBB5_6: ; %atomicrmw.phi
+; GFX942-NEXT:  ; %bb.7: ; %atomicrmw.end
+; GFX942-NEXT:    s_mov_b32 s0, 32
+; GFX942-NEXT:    v_lshrrev_b64 v[4:5], s0, v[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %rmw = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @global to ptr), double %val monotonic, align 8
+  ret double %rmw
+}
+
 attributes #0 = { nounwind }
+attributes #1 = { noinline nounwind optnone }
 
 !0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index e1ba6489a5317..f0609f62a9024 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,36 +1,66 @@
-; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsad < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; Check illegal casts are codegened as poison, and not an error.
 
-; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (ptr addrspace(3)): invalid addrspacecast
 define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
+; CHECK-LABEL: use_group_to_global_addrspacecast:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    flat_store_dword v[0:1], v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_endpgm
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(1)
   store volatile i32 0, ptr addrspace(1) %stof
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_local_to_constant32bit_addrspacecast void (ptr addrspace(3)): invalid addrspacecast
 define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(ptr addrspace(3) %ptr) {
+; CHECK-LABEL: use_local_to_constant32bit_addrspacecast:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    s_load_dword s0, s[0:1], 0x0
+; CHECK-NEXT:    s_endpgm
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(6)
   %load = load volatile i32, ptr addrspace(6) %stof
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_constant32bit_to_local_addrspacecast void (ptr addrspace(6)): invalid addrspacecast
 define amdgpu_kernel void @use_constant32bit_to_local_addrspacecast(ptr addrspace(6) %ptr) {
+; CHECK-LABEL: use_constant32bit_to_local_addrspacecast:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_mov_b32 m0, -1
+; CHECK-NEXT:    ds_read_b32 v0, v0
+; CHECK-NEXT:    s_endpgm
   %cast = addrspacecast ptr addrspace(6) %ptr to ptr addrspace(3)
   %load = load volatile i32, ptr addrspace(3) %cast
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_local_to_42_addrspacecast void (ptr addrspace(3)): invalid addrspacecast
 define amdgpu_kernel void @use_local_to_42_addrspacecast(ptr addrspace(3) %ptr) {
+; SDAG-LABEL: use_local_to_42_addrspacecast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: use_local_to_42_addrspacecast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_endpgm
   %cast = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(42)
   store volatile ptr addrspace(42) %cast, ptr addrspace(1) null
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_42_to_local_addrspacecast void (ptr addrspace(42)): invalid addrspacecast
 define amdgpu_kernel void @use_42_to_local_addrspacecast(ptr addrspace(42) %ptr) {
+; CHECK-LABEL: use_42_to_local_addrspacecast:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_mov_b32 m0, -1
+; CHECK-NEXT:    ds_read_b32 v0, v0
+; CHECK-NEXT:    s_endpgm
   %cast = addrspacecast ptr addrspace(42) %ptr to ptr addrspace(3)
   %load = load volatile i32, ptr addrspace(3) %cast
   ret void

From dc1bd6a8fa6a5f4fc38f7c3ce77c0ffcfcaa66e9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 19 Feb 2025 17:46:29 -0800
Subject: [PATCH 178/282] [CMake][Release] Statically link clang with stage1
 runtimes (#127268)

This change will cause clang and the other tools to statically link
against the runtimes built in stage1. This will make the built binaries
more portable by eliminating dependencies on system libraries like
libgcc and libstdc++.

(cherry picked from commit f5b311e47de044160aeb25221095898c35c4847f)
---
 clang/cmake/caches/Release.cmake | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index 23e99493087ff..a1c68fc51dbd0 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -48,10 +48,8 @@ set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "")
 
 set(STAGE1_PROJECTS "clang")
 
-# Building Flang on Windows requires compiler-rt, so we need to build it in
-# stage1.  compiler-rt is also required for building the Flang tests on
-# macOS.
-set(STAGE1_RUNTIMES "compiler-rt")
+# Build all runtimes so we can statically link them into the stage2 compiler.
+set(STAGE1_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind")
 
 if (LLVM_RELEASE_ENABLE_PGO)
   list(APPEND STAGE1_PROJECTS "lld")
@@ -90,9 +88,20 @@ else()
   set(CLANG_BOOTSTRAP_TARGETS ${LLVM_RELEASE_FINAL_STAGE_TARGETS} CACHE STRING "")
 endif()
 
+if (LLVM_RELEASE_ENABLE_LTO)
+  # Enable LTO for the runtimes.  We need to configure stage1 clang to default
+  # to using lld as the linker because the stage1 toolchain will be used to
+  # build and link the runtimes.
+  # FIXME: We can't use LLVM_ENABLE_LTO=Thin here, because it causes the CMake
+  # step for the libcxx build to fail.  CMAKE_INTERPROCEDURAL_OPTIMIZATION does
+  # enable ThinLTO, though.
+  set(RUNTIMES_CMAKE_ARGS "-DCMAKE_INTERPROCEDURAL_OPTIMIZATION=ON -DLLVM_ENABLE_LLD=ON" CACHE STRING "")
+endif()
+
 # Stage 1 Common Config
 set(LLVM_ENABLE_RUNTIMES ${STAGE1_RUNTIMES} CACHE STRING "")
 set(LLVM_ENABLE_PROJECTS ${STAGE1_PROJECTS} CACHE STRING "")
+set(LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY ON CACHE STRING "")
 
 # stage2-instrumented and Final Stage Config:
 # Options that need to be set in both the instrumented stage (if we are doing
@@ -102,6 +111,16 @@ set_instrument_and_final_stage_var(LLVM_ENABLE_LTO "${LLVM_RELEASE_ENABLE_LTO}"
 if (LLVM_RELEASE_ENABLE_LTO)
   set_instrument_and_final_stage_var(LLVM_ENABLE_LLD "ON" BOOL)
 endif()
+set_instrument_and_final_stage_var(LLVM_ENABLE_LIBCXX "ON" BOOL)
+set_instrument_and_final_stage_var(LLVM_STATIC_LINK_CXX_STDLIB "ON" BOOL)
+set(RELEASE_LINKER_FLAGS "-rtlib=compiler-rt --unwindlib=libunwind")
+if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin")
+  set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc")
+endif()
+
+set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
+set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
+set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
 
 # Final Stage Config (stage2)
 set_final_stage_var(LLVM_ENABLE_RUNTIMES "${LLVM_RELEASE_ENABLE_RUNTIMES}" STRING)

From 77195a5edb332947a991a1f0c4e915f5f1d9411f Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson@google.com>
Date: Sun, 9 Feb 2025 12:18:52 -0800
Subject: [PATCH 179/282] [CSKY] Default to unsigned char

This matches the ABI document found at
https://github.com/c-sky/csky-doc/blob/master/C-SKY_V2_CPU_Applications_Binary_Interface_Standards_Manual.pdf

Partially addresses https://github.com/llvm/llvm-project/issues/115957

Reviewed By: zixuan-wu

Pull Request: https://github.com/llvm/llvm-project/pull/115961

(cherry picked from commit d2047242e6d0f0deb7634ff22ab164354c520c79)
---
 clang/lib/Driver/ToolChains/Clang.cpp | 1 +
 clang/test/Driver/csky-toolchain.c    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index ec5ee29ece434..57b7d2bd46989 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1358,6 +1358,7 @@ static bool isSignedCharDefault(const llvm::Triple &Triple) {
       return true;
     return false;
 
+  case llvm::Triple::csky:
   case llvm::Triple::hexagon:
   case llvm::Triple::msp430:
   case llvm::Triple::ppcle:
diff --git a/clang/test/Driver/csky-toolchain.c b/clang/test/Driver/csky-toolchain.c
index 66485464652ac..638ce64ec98cd 100644
--- a/clang/test/Driver/csky-toolchain.c
+++ b/clang/test/Driver/csky-toolchain.c
@@ -3,6 +3,7 @@
 
 // RUN: %clang -### %s --target=csky 2>&1 | FileCheck -check-prefix=CC1 %s
 // CC1: "-cc1" "-triple" "csky"
+// CC1: "-fno-signed-char"
 
 // In the below tests, --rtlib=platform is used so that the driver ignores
 // the configure-time CLANG_DEFAULT_RTLIB option when choosing the runtime lib

From 8e06e0e65ffa7e5e87ba2c0b48f7bcd9b1cf1600 Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Thu, 13 Feb 2025 10:59:21 +0100
Subject: [PATCH 180/282] [clang] Fix preprocessor output from #embed (#126742)

When bytes with negative signed char values appear in the data, make
sure to use raw bytes from the data string when preprocessing, not char
values.

Fixes https://github.com/llvm/llvm-project/issues/102798
---
 clang/docs/ReleaseNotes.rst                        | 2 ++
 clang/lib/Frontend/PrintPreprocessedOutput.cpp     | 5 ++---
 clang/test/Preprocessor/embed_preprocess_to_file.c | 8 ++++++++
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 03c420bcfd932..e716efa46a1f0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -897,6 +897,8 @@ Bug Fixes in This Version
 - No longer return ``false`` for ``noexcept`` expressions involving a
   ``delete`` which resolves to a destroying delete but the type of the object
   being deleted has a potentially throwing destructor (#GH118660).
+- Clang now outputs correct values when #embed data contains bytes with negative
+  signed char values (#GH102798).
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index 1005825441b3e..2ae355fb33885 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -974,11 +974,10 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
       // Loop over the contents and print them as a comma-delimited list of
       // values.
       bool PrintComma = false;
-      for (auto Iter = Data->BinaryData.begin(), End = Data->BinaryData.end();
-           Iter != End; ++Iter) {
+      for (unsigned char Byte : Data->BinaryData.bytes()) {
         if (PrintComma)
           *Callbacks->OS << ", ";
-        *Callbacks->OS << static_cast<unsigned>(*Iter);
+        *Callbacks->OS << static_cast<int>(Byte);
         PrintComma = true;
       }
     } else if (Tok.isAnnotation()) {
diff --git a/clang/test/Preprocessor/embed_preprocess_to_file.c b/clang/test/Preprocessor/embed_preprocess_to_file.c
index 9895d958cf96d..b3c99d36f784a 100644
--- a/clang/test/Preprocessor/embed_preprocess_to_file.c
+++ b/clang/test/Preprocessor/embed_preprocess_to_file.c
@@ -37,3 +37,11 @@ const char even_more[] = {
 // DIRECTIVE-NEXT: #embed <jk.txt> prefix(4, 5,) suffix(, 6, 7) /* clang -E -dE */
 // DIRECTIVE-NEXT:  , 8, 9, 10
 // DIRECTIVE-NEXT: };
+
+constexpr char big_one[] = {
+#embed <big_char.txt>
+};
+
+// EXPANDED: constexpr char big_one[] = {255
+// DIRECTIVE: constexpr char big_one[] = {
+// DIRECTIVE-NEXT: #embed <big_char.txt>

From 720ab3be5f83c781730854f3f0e2c799a80d1b99 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi@arm.com>
Date: Thu, 13 Feb 2025 17:48:02 +0000
Subject: [PATCH 181/282] [VPlan] Compute cost for binary op VPInstruction with
 underlying values. #125434

As exposed by #125094, we are missing cost computation for some
binary VPInstructions we created based on original IR instructions.
Their cost should be considered.
PR: #125434

Author: Florian Hahn <flo@fhahn.com>

Change-Id: Icf985b3f1cd40898a17faaf47b241e2651f9e8dd
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  5 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 24 ++++++
 .../X86/CostModel/vpinstruction-cost.ll       | 74 +++++++++++++++++++
 3 files changed, 99 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6c95b08a02014..f4e5b1870894c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1314,10 +1314,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
 
   /// Return the cost of this VPInstruction.
   InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override {
-    // TODO: Compute accurate cost after retiring the legacy cost model.
-    return 0;
-  }
+                              VPCostContext &Ctx) const override;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the VPInstruction to \p O.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6b746a48c46ec..97be2da24fc37 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -703,6 +703,30 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
 }
 
+InstructionCost VPInstruction::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) const {
+  if (Instruction::isBinaryOp(getOpcode())) {
+    if (!getUnderlyingValue()) {
+      // TODO: Compute cost for VPInstructions without underlying values once
+      // the legacy cost model has been retired.
+      return 0;
+    }
+    assert(!doesGeneratePerAllLanes() &&
+           "Should only generate a vector value or single scalar, not scalars "
+           "for all lanes.");
+    Type *ResTy = Ctx.Types.inferScalarType(this);
+    if (!vputils::onlyFirstLaneUsed(this))
+      ResTy = toVectorTy(ResTy, VF);
+    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
+  }
+
+  // TODO: Compute cost other VPInstructions once the legacy cost model has
+  // been retired.
+  assert(!getUnderlyingValue() &&
+          "unexpected VPInstruction witht underlying value");
+  return 0;
+}
+
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
new file mode 100644
index 0000000000000..e7229bd0e7521
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of"
+; RUN: opt -S -passes=loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -debug -disable-output -S %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) {
+; CHECK-LABEL: 'wide_or_replaced_with_add_vpinstruction'
+; CHECK:  Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1
+; CHECK:  Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+; CHECK:  Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
+; CHECK:  Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK:  Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
+; CHECK:  Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK:  Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4>
+; CHECK:  Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%g.src>
+; CHECK:  Cost of 1 for VF 2: WIDEN ir<%l> = load vp<%5>
+; CHECK:  Cost of 1 for VF 2: WIDEN ir<%iv.4> = add ir<%iv>, ir<4>
+; CHECK:  Cost of 1 for VF 2: WIDEN ir<%c> = icmp ule ir<%l>, ir<128>
+; CHECK:  Cost of 1 for VF 2: EMIT ir<%or> = add ir<%iv.4>, ir<1>
+; CHECK:  Cost of 0 for VF 2: CLONE ir<%g.dst> = getelementptr ir<%dst>, ir<%or>
+; CHECK:  Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%g.dst>
+; CHECK:  Cost of 1 for VF 2: WIDEN store vp<%6>, ir<%iv.4>, ir<%c>
+; CHECK:  Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK:  Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK:  Cost of 0 for VF 2: vector loop backedge
+; CHECK:  Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
+; CHECK:  Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+; CHECK:  Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
+; CHECK:  Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK:  Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
+; CHECK:  Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK:  Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4>
+; CHECK:  Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%g.src>
+; CHECK:  Cost of 1 for VF 4: WIDEN ir<%l> = load vp<%5>
+; CHECK:  Cost of 1 for VF 4: WIDEN ir<%iv.4> = add ir<%iv>, ir<4>
+; CHECK:  Cost of 1 for VF 4: WIDEN ir<%c> = icmp ule ir<%l>, ir<128>
+; CHECK:  Cost of 1 for VF 4: EMIT ir<%or> = add ir<%iv.4>, ir<1>
+; CHECK:  Cost of 0 for VF 4: CLONE ir<%g.dst> = getelementptr ir<%dst>, ir<%or>
+; CHECK:  Cost of 0 for VF 4: vp<%6> = vector-pointer ir<%g.dst>
+; CHECK:  Cost of 1 for VF 4: WIDEN store vp<%6>, ir<%iv.4>, ir<%c>
+; CHECK:  Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK:  Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK:  Cost of 0 for VF 4: vector loop backedge
+; CHECK:  Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
+; CHECK:  Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+; CHECK:  Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %g.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %g.src
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %c = icmp ule i64 %l, 128
+  br i1 %c, label %loop.then, label %loop.latch
+
+loop.then:
+  %or = or disjoint i64 %iv.4, 1
+  %g.dst = getelementptr inbounds i64, ptr %dst, i64 %or
+  store i64 %iv.4, ptr %g.dst, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 32
+  br i1 %exitcond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
\ No newline at end of file

From deb63e72d6c9ed98a2fbf4f8249ca6911bd189b8 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Fri, 21 Feb 2025 15:49:10 -0300
Subject: [PATCH 182/282] [clang] Track function template instantiation from
 definition (#125266) (#127777)

This fixes instantiation of definition for friend function templates,
when the declaration found and the one containing the definition have
different template contexts.

In these cases, the the function declaration corresponding to the
definition is not available; it may not even be instantiated at all.

So this patch adds a bit which tracks which function template
declaration was instantiated from the member template. It's used to find
which primary template serves as a context for the purpose of
obtainining the template arguments needed to instantiate the definition.

Fixes #55509
---
 clang/docs/ReleaseNotes.rst                   |   1 +
 clang/include/clang/AST/Decl.h                |   7 ++
 clang/include/clang/AST/DeclBase.h            |  10 +-
 clang/include/clang/AST/DeclTemplate.h        |  20 ++++
 clang/lib/AST/Decl.cpp                        |   1 +
 clang/lib/Sema/SemaTemplateDeduction.cpp      |  17 +--
 clang/lib/Sema/SemaTemplateInstantiate.cpp    |   9 +-
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  27 ++++-
 clang/lib/Serialization/ASTReaderDecl.cpp     |   1 +
 clang/lib/Serialization/ASTWriterDecl.cpp     |   3 +-
 clang/test/SemaTemplate/GH55509.cpp           | 112 ++++++++++++++++++
 11 files changed, 180 insertions(+), 28 deletions(-)
 create mode 100644 clang/test/SemaTemplate/GH55509.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e716efa46a1f0..a2518042cb5b0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1057,6 +1057,7 @@ Bug Fixes to C++ Support
 - Fix that some dependent immediate expressions did not cause immediate escalation (#GH119046)
 - Fixed a substitution bug in transforming CTAD aliases when the type alias contains a non-pack template argument
   corresponding to a pack parameter (#GH124715)
+- Clang is now better at keeping track of friend function template instance contexts. (#GH55509)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 9593bab576412..362a2741a0cdd 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2298,6 +2298,13 @@ class FunctionDecl : public DeclaratorDecl,
     FunctionDeclBits.IsLateTemplateParsed = ILT;
   }
 
+  bool isInstantiatedFromMemberTemplate() const {
+    return FunctionDeclBits.IsInstantiatedFromMemberTemplate;
+  }
+  void setInstantiatedFromMemberTemplate(bool Val = true) {
+    FunctionDeclBits.IsInstantiatedFromMemberTemplate = Val;
+  }
+
   /// Whether this function is "trivial" in some specialized C++ senses.
   /// Can only be true for default constructors, copy constructors,
   /// copy assignment operators, and destructors.  Not meaningful until
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 3bb82c1572ef9..648dae2838e03 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -1780,6 +1780,8 @@ class DeclContext {
     uint64_t HasImplicitReturnZero : 1;
     LLVM_PREFERRED_TYPE(bool)
     uint64_t IsLateTemplateParsed : 1;
+    LLVM_PREFERRED_TYPE(bool)
+    uint64_t IsInstantiatedFromMemberTemplate : 1;
 
     /// Kind of contexpr specifier as defined by ConstexprSpecKind.
     LLVM_PREFERRED_TYPE(ConstexprSpecKind)
@@ -1830,7 +1832,7 @@ class DeclContext {
   };
 
   /// Number of inherited and non-inherited bits in FunctionDeclBitfields.
-  enum { NumFunctionDeclBits = NumDeclContextBits + 31 };
+  enum { NumFunctionDeclBits = NumDeclContextBits + 32 };
 
   /// Stores the bits used by CXXConstructorDecl. If modified
   /// NumCXXConstructorDeclBits and the accessor
@@ -1841,12 +1843,12 @@ class DeclContext {
     LLVM_PREFERRED_TYPE(FunctionDeclBitfields)
     uint64_t : NumFunctionDeclBits;
 
-    /// 20 bits to fit in the remaining available space.
+    /// 19 bits to fit in the remaining available space.
     /// Note that this makes CXXConstructorDeclBitfields take
     /// exactly 64 bits and thus the width of NumCtorInitializers
     /// will need to be shrunk if some bit is added to NumDeclContextBitfields,
     /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields.
-    uint64_t NumCtorInitializers : 17;
+    uint64_t NumCtorInitializers : 16;
     LLVM_PREFERRED_TYPE(bool)
     uint64_t IsInheritingConstructor : 1;
 
@@ -1860,7 +1862,7 @@ class DeclContext {
   };
 
   /// Number of inherited and non-inherited bits in CXXConstructorDeclBitfields.
-  enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 20 };
+  enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 19 };
 
   /// Stores the bits used by ObjCMethodDecl.
   /// If modified NumObjCMethodDeclBits and the accessor
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 03c43765206b1..472b079c8f728 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -1011,6 +1011,26 @@ class FunctionTemplateDecl : public RedeclarableTemplateDecl {
     return getTemplatedDecl()->isThisDeclarationADefinition();
   }
 
+  bool isCompatibleWithDefinition() const {
+    return getTemplatedDecl()->isInstantiatedFromMemberTemplate() ||
+           isThisDeclarationADefinition();
+  }
+
+  // This bit closely tracks 'RedeclarableTemplateDecl::InstantiatedFromMember',
+  // except this is per declaration, while the redeclarable field is
+  // per chain. This indicates a template redeclaration which
+  // is compatible with the definition, in the non-trivial case
+  // where this is not already a definition.
+  // This is only really needed for instantiating the definition of friend
+  // function templates, which can have redeclarations in different template
+  // contexts.
+  // The bit is actually stored in the FunctionDecl for space efficiency
+  // reasons.
+  void setInstantiatedFromMemberTemplate(FunctionTemplateDecl *D) {
+    getTemplatedDecl()->setInstantiatedFromMemberTemplate();
+    RedeclarableTemplateDecl::setInstantiatedFromMemberTemplate(D);
+  }
+
   /// Return the specialization with the provided arguments if it exists,
   /// otherwise return the insertion point.
   FunctionDecl *findSpecialization(ArrayRef<TemplateArgument> Args,
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 0bd4d64b54a0f..ba77c748815d5 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3069,6 +3069,7 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC,
   FunctionDeclBits.IsIneligibleOrNotSelected = false;
   FunctionDeclBits.HasImplicitReturnZero = false;
   FunctionDeclBits.IsLateTemplateParsed = false;
+  FunctionDeclBits.IsInstantiatedFromMemberTemplate = false;
   FunctionDeclBits.ConstexprKind = static_cast<uint64_t>(ConstexprKind);
   FunctionDeclBits.BodyContainsImmediateEscalatingExpression = false;
   FunctionDeclBits.InstantiationIsPending = false;
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 425c41f0f6236..5304b5a2155b4 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -4072,22 +4072,7 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
   if (FunctionTemplate->getFriendObjectKind())
     Owner = FunctionTemplate->getLexicalDeclContext();
   FunctionDecl *FD = FunctionTemplate->getTemplatedDecl();
-  // additional check for inline friend,
-  // ```
-  //   template <class F1> int foo(F1 X);
-  //   template <int A1> struct A {
-  //     template <class F1> friend int foo(F1 X) { return A1; }
-  //   };
-  //   template struct A<1>;
-  //   int a = foo(1.0);
-  // ```
-  const FunctionDecl *FDFriend;
-  if (FD->getFriendObjectKind() == Decl::FriendObjectKind::FOK_None &&
-      FD->isDefined(FDFriend, /*CheckForPendingFriendDefinition*/ true) &&
-      FDFriend->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None) {
-    FD = const_cast<FunctionDecl *>(FDFriend);
-    Owner = FD->getLexicalDeclContext();
-  }
+
   MultiLevelTemplateArgumentList SubstArgs(
       FunctionTemplate, CanonicalDeducedArgumentList->asArray(),
       /*Final=*/false);
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index eec56b7493bad..cf29d8a101b43 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -479,9 +479,6 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs(
   using namespace TemplateInstArgsHelpers;
   const Decl *CurDecl = ND;
 
-  if (!CurDecl)
-    CurDecl = Decl::castFromDeclContext(DC);
-
   if (Innermost) {
     Result.addOuterTemplateArguments(const_cast<NamedDecl *>(ND), *Innermost,
                                      Final);
@@ -495,8 +492,10 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs(
     // has a depth of 0.
     if (const auto *TTP = dyn_cast<TemplateTemplateParmDecl>(CurDecl))
       HandleDefaultTempArgIntoTempTempParam(TTP, Result);
-    CurDecl = Response::UseNextDecl(CurDecl).NextDecl;
-  }
+    CurDecl = DC ? Decl::castFromDeclContext(DC)
+                 : Response::UseNextDecl(CurDecl).NextDecl;
+  } else if (!CurDecl)
+    CurDecl = Decl::castFromDeclContext(DC);
 
   while (!CurDecl->isFileContextDecl()) {
     Response R;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index b4f4469ed4e48..89ad2a0a9b7bb 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -12,6 +12,7 @@
 #include "TreeTransform.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTLambda.h"
 #include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/DependentDiagnostic.h"
@@ -5245,9 +5246,31 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
     RebuildTypeSourceInfoForDefaultSpecialMembers();
     SetDeclDefaulted(Function, PatternDecl->getLocation());
   } else {
+    NamedDecl *ND = Function;
+    DeclContext *DC = ND->getLexicalDeclContext();
+    std::optional<ArrayRef<TemplateArgument>> Innermost;
+    if (auto *Primary = Function->getPrimaryTemplate();
+        Primary &&
+        !isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function) &&
+        Function->getTemplateSpecializationKind() !=
+            TSK_ExplicitSpecialization) {
+      auto It = llvm::find_if(Primary->redecls(),
+                              [](const RedeclarableTemplateDecl *RTD) {
+                                return cast<FunctionTemplateDecl>(RTD)
+                                    ->isCompatibleWithDefinition();
+                              });
+      assert(It != Primary->redecls().end() &&
+             "Should't get here without a definition");
+      if (FunctionDecl *Def = cast<FunctionTemplateDecl>(*It)
+                                  ->getTemplatedDecl()
+                                  ->getDefinition())
+        DC = Def->getLexicalDeclContext();
+      else
+        DC = (*It)->getLexicalDeclContext();
+      Innermost.emplace(Function->getTemplateSpecializationArgs()->asArray());
+    }
     MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs(
-        Function, Function->getLexicalDeclContext(), /*Final=*/false,
-        /*Innermost=*/std::nullopt, false, PatternDecl);
+        Function, DC, /*Final=*/false, Innermost, false, PatternDecl);
 
     // Substitute into the qualifier; we can get a substitution failure here
     // through evil use of alias templates.
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 8921b92178ee2..b4ff71c8958a4 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -1064,6 +1064,7 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
   FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit());
   FD->setIsMultiVersion(FunctionDeclBits.getNextBit());
   FD->setLateTemplateParsed(FunctionDeclBits.getNextBit());
+  FD->setInstantiatedFromMemberTemplate(FunctionDeclBits.getNextBit());
   FD->setFriendConstraintRefersToEnclosingTemplate(
       FunctionDeclBits.getNextBit());
   FD->setUsesSEHTry(FunctionDeclBits.getNextBit());
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 3505db441e829..a5d8a3f05bc99 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -679,7 +679,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) {
 }
 
 void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
-  static_assert(DeclContext::NumFunctionDeclBits == 44,
+  static_assert(DeclContext::NumFunctionDeclBits == 45,
                 "You need to update the serializer after you change the "
                 "FunctionDeclBits");
 
@@ -785,6 +785,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
   FunctionDeclBits.addBit(D->hasImplicitReturnZero());
   FunctionDeclBits.addBit(D->isMultiVersion());
   FunctionDeclBits.addBit(D->isLateTemplateParsed());
+  FunctionDeclBits.addBit(D->isInstantiatedFromMemberTemplate());
   FunctionDeclBits.addBit(D->FriendConstraintRefersToEnclosingTemplate());
   FunctionDeclBits.addBit(D->usesSEHTry());
   Record.push_back(FunctionDeclBits);
diff --git a/clang/test/SemaTemplate/GH55509.cpp b/clang/test/SemaTemplate/GH55509.cpp
new file mode 100644
index 0000000000000..773a84305a0cd
--- /dev/null
+++ b/clang/test/SemaTemplate/GH55509.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++26 %s
+
+namespace t1 {
+  template<int N> struct A {
+    template<class C> friend auto cica(const A<N-1>&, C) {
+      return N;
+    }
+  };
+
+  template<> struct A<0> {
+    template<class C> friend auto cica(const A<0>&, C);
+    // expected-note@-1 {{declared here}}
+  };
+
+  void test() {
+    cica(A<0>{}, 0);
+    // expected-error@-1 {{function 'cica<int>' with deduced return type cannot be used before it is defined}}
+
+    (void)A<1>{};
+    cica(A<0>{}, 0);
+  }
+} // namespace t1
+namespace t2 {
+  template<int N> struct A {
+    template<class C> friend auto cica(const A<N-1>&, C) {
+      return N;
+    }
+  };
+
+  template<> struct A<0> {
+    template<class C> friend auto cica(const A<0>&, C);
+  };
+
+  template <int N, class = decltype(cica(A<N>{}, nullptr))>
+  void MakeCica();
+  // expected-note@-1 {{candidate function}}
+
+  template <int N> void MakeCica(A<N+1> = {});
+  // expected-note@-1 {{candidate function}}
+
+  void test() {
+    MakeCica<0>();
+
+    MakeCica<0>();
+    // expected-error@-1 {{call to 'MakeCica' is ambiguous}}
+  }
+} // namespace t2
+namespace t3 {
+  template<int N> struct A {
+    template<class C> friend auto cica(const A<N-1>&, C) {
+      return N-1;
+    }
+  };
+
+  template<> struct A<0> {
+    template<class C> friend auto cica(const A<0>&, C);
+  };
+
+  template <int N, class AT, class = decltype(cica(AT{}, nullptr))>
+  static constexpr bool MakeCica(int);
+
+  template <int N, class AT>
+  static constexpr bool MakeCica(short, A<N+1> = {});
+
+  template <int N, class AT = A<N>, class Val = decltype(MakeCica<N, AT>(0))>
+  static constexpr bool has_cica = Val{};
+
+  constexpr bool cica2 = has_cica<0> || has_cica<0>;
+} // namespace t3
+namespace t4 {
+  template<int N> struct A {
+    template<class C> friend auto cica(const A<N-1>&, C);
+  };
+
+  template<> struct A<0> {
+    template<class C> friend auto cica(const A<0>&, C) {
+      C a;
+    }
+  };
+
+  template struct A<1>;
+
+  void test() {
+    cica(A<0>{}, 0);
+  }
+} // namespace t4
+namespace regression1 {
+  template <class> class A;
+
+  template <class T> [[gnu::abi_tag("TAG")]] void foo(A<T>);
+
+  template <class> struct A {
+    friend void foo <>(A);
+  };
+
+  template struct A<int>;
+
+  template <class T> [[gnu::abi_tag("TAG")]] void foo(A<T>) {}
+
+  template void foo<int>(A<int>);
+} // namespace regression1
+namespace regression2 {
+  template <class> struct A {
+    template <class T> static void f() {
+      A<int>::f<T>();
+    }
+  };
+  template <> template <class T> void A<int>::f() {
+    static_assert(__is_same(T, long));
+  }
+  template void A<void>::f<long>();
+} // namespace regression2

From 3076a68f69aac3f87195eec12f38908a499263cb Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Fri, 21 Feb 2025 12:53:13 +0530
Subject: [PATCH 183/282] [RISCV] [MachineOutliner] Analyze all candidates
 (#127659)

#117700 made a change from analyzing all the candidates to analyzing
just the first candidate before deciding to either delete or keep all of
them.

Even though the candidates all have the same instructions, the basic
blocks in which they are present are different and we will need to check
each of them before deciding whether to keep or erase them.
Particularly, `isAvailableAcrossAndOutOfSeq` checks to see if the
register (x5 in this case) is available from the end of the MBB to the
beginning of the candidate and not checking this for each candidate led
to incorrect candidates being outlined resulting in correctness issues
in a few downstream benchmarks.

Similarly, deleting all the candidates if the first one is not viable
will result in missed outlining opportunities.

(cherry picked from commit 6757cf4e6f1c7767d605e579930a24758c0778dc)
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |  52 +++----
 .../machine-outliner-call-x5-liveout.mir      | 136 ++++++++++++++++++
 2 files changed, 158 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/machine-outliner-call-x5-liveout.mir

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 12a7af0750813..87f1f35835cbe 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3010,30 +3010,25 @@ static bool cannotInsertTailCall(const MachineBasicBlock &MBB) {
   return false;
 }
 
-static std::optional<MachineOutlinerConstructionID>
-analyzeCandidate(outliner::Candidate &C) {
+static bool analyzeCandidate(outliner::Candidate &C) {
   // If last instruction is return then we can rely on
   // the verification already performed in the getOutliningTypeImpl.
   if (C.back().isReturn()) {
     assert(!cannotInsertTailCall(*C.getMBB()) &&
            "The candidate who uses return instruction must be outlined "
            "using tail call");
-    return MachineOutlinerTailCall;
+    return false;
   }
 
-  auto CandidateUsesX5 = [](outliner::Candidate &C) {
-    const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo();
-    if (std::any_of(C.begin(), C.end(), [TRI](const MachineInstr &MI) {
-          return isMIModifiesReg(MI, TRI, RISCV::X5);
-        }))
-      return true;
-    return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI);
-  };
-
-  if (!CandidateUsesX5(C))
-    return MachineOutlinerDefault;
+  // Filter out candidates where the X5 register (t0) can't be used to setup
+  // the function call.
+  const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo();
+  if (std::any_of(C.begin(), C.end(), [TRI](const MachineInstr &MI) {
+        return isMIModifiesReg(MI, TRI, RISCV::X5);
+      }))
+    return true;
 
-  return std::nullopt;
+  return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI);
 }
 
 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
@@ -3042,35 +3037,32 @@ RISCVInstrInfo::getOutliningCandidateInfo(
     std::vector<outliner::Candidate> &RepeatedSequenceLocs,
     unsigned MinRepeats) const {
 
-  // Each RepeatedSequenceLoc is identical.
-  outliner::Candidate &Candidate = RepeatedSequenceLocs[0];
-  auto CandidateInfo = analyzeCandidate(Candidate);
-  if (!CandidateInfo)
-    RepeatedSequenceLocs.clear();
+  // Analyze each candidate and erase the ones that are not viable.
+  llvm::erase_if(RepeatedSequenceLocs, analyzeCandidate);
 
   // If the sequence doesn't have enough candidates left, then we're done.
   if (RepeatedSequenceLocs.size() < MinRepeats)
     return std::nullopt;
 
+  // Each RepeatedSequenceLoc is identical.
+  outliner::Candidate &Candidate = RepeatedSequenceLocs[0];
   unsigned InstrSizeCExt =
       Candidate.getMF()->getSubtarget<RISCVSubtarget>().hasStdExtCOrZca() ? 2
                                                                           : 4;
   unsigned CallOverhead = 0, FrameOverhead = 0;
 
-  MachineOutlinerConstructionID MOCI = CandidateInfo.value();
-  switch (MOCI) {
-  case MachineOutlinerDefault:
-    // call t0, function = 8 bytes.
-    CallOverhead = 8;
-    // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
-    FrameOverhead = InstrSizeCExt;
-    break;
-  case MachineOutlinerTailCall:
+  MachineOutlinerConstructionID MOCI = MachineOutlinerDefault;
+  if (Candidate.back().isReturn()) {
+    MOCI = MachineOutlinerTailCall;
     // tail call = auipc + jalr in the worst case without linker relaxation.
     CallOverhead = 4 + InstrSizeCExt;
     // Using tail call we move ret instruction from caller to callee.
     FrameOverhead = 0;
-    break;
+  } else {
+    // call t0, function = 8 bytes.
+    CallOverhead = 8;
+    // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
+    FrameOverhead = InstrSizeCExt;
   }
 
   for (auto &C : RepeatedSequenceLocs)
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-call-x5-liveout.mir b/llvm/test/CodeGen/RISCV/machine-outliner-call-x5-liveout.mir
new file mode 100644
index 0000000000000..f7bea33e52885
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-call-x5-liveout.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \
+# RUN: | FileCheck -check-prefixes=RV32I-MO %s
+# RUN: llc -mtriple=riscv64 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \
+# RUN: | FileCheck -check-prefixes=RV64I-MO %s
+
+# MIR has been edited by hand to have x5 as live out in @dont_outline
+
+---
+
+name:            outline_0
+tracksRegLiveness: true
+isOutlined: false
+body:             |
+  bb.0:
+    liveins: $x10, $x11
+
+    ; RV32I-MO-LABEL: name: outline_0
+    ; RV32I-MO: liveins: $x10, $x11
+    ; RV32I-MO-NEXT: {{  $}}
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET implicit $x11
+    ;
+    ; RV64I-MO-LABEL: name: outline_0
+    ; RV64I-MO: liveins: $x10, $x11
+    ; RV64I-MO-NEXT: {{  $}}
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET implicit $x11
+    $x11 = ORI $x11, 1023
+    $x12 = ADDI $x10, 17
+    $x11 = AND $x12, $x11
+    $x10 = SUB $x10, $x11
+    PseudoRET implicit $x11
+
+...
+---
+
+name:            dont_outline
+tracksRegLiveness: true
+isOutlined: false
+body:             |
+  ; RV32I-MO-LABEL: name: dont_outline
+  ; RV32I-MO: bb.0:
+  ; RV32I-MO-NEXT:   liveins: $x10, $x11
+  ; RV32I-MO-NEXT: {{  $}}
+  ; RV32I-MO-NEXT:   $x11 = ORI $x11, 1023
+  ; RV32I-MO-NEXT:   $x12 = ADDI $x10, 17
+  ; RV32I-MO-NEXT:   $x11 = AND $x12, $x11
+  ; RV32I-MO-NEXT:   $x10 = SUB $x10, $x11
+  ; RV32I-MO-NEXT:   BEQ $x10, $x11, %bb.1
+  ; RV32I-MO-NEXT: {{  $}}
+  ; RV32I-MO-NEXT: bb.1:
+  ; RV32I-MO-NEXT:   liveins: $x10, $x5
+  ; RV32I-MO-NEXT: {{  $}}
+  ; RV32I-MO-NEXT:   PseudoRET implicit $x10, implicit $x5
+  ;
+  ; RV64I-MO-LABEL: name: dont_outline
+  ; RV64I-MO: bb.0:
+  ; RV64I-MO-NEXT:   liveins: $x10, $x11
+  ; RV64I-MO-NEXT: {{  $}}
+  ; RV64I-MO-NEXT:   $x11 = ORI $x11, 1023
+  ; RV64I-MO-NEXT:   $x12 = ADDI $x10, 17
+  ; RV64I-MO-NEXT:   $x11 = AND $x12, $x11
+  ; RV64I-MO-NEXT:   $x10 = SUB $x10, $x11
+  ; RV64I-MO-NEXT:   BEQ $x10, $x11, %bb.1
+  ; RV64I-MO-NEXT: {{  $}}
+  ; RV64I-MO-NEXT: bb.1:
+  ; RV64I-MO-NEXT:   liveins: $x10, $x5
+  ; RV64I-MO-NEXT: {{  $}}
+  ; RV64I-MO-NEXT:   PseudoRET implicit $x10, implicit $x5
+  bb.0:
+    liveins: $x10, $x11
+
+    $x11 = ORI $x11, 1023
+    $x12 = ADDI $x10, 17
+    $x11 = AND $x12, $x11
+    $x10 = SUB $x10, $x11
+    BEQ $x10, $x11, %bb.1
+
+  bb.1:
+    liveins: $x10, $x5
+    PseudoRET implicit $x10, implicit $x5
+
+...
+---
+
+name:            outline_1
+tracksRegLiveness: true
+isOutlined: false
+body:             |
+  bb.0:
+    liveins: $x10, $x11
+
+    ; RV32I-MO-LABEL: name: outline_1
+    ; RV32I-MO: liveins: $x10, $x11
+    ; RV32I-MO-NEXT: {{  $}}
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET implicit $x10
+    ;
+    ; RV64I-MO-LABEL: name: outline_1
+    ; RV64I-MO: liveins: $x10, $x11
+    ; RV64I-MO-NEXT: {{  $}}
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET implicit $x10
+    $x11 = ORI $x11, 1023
+    $x12 = ADDI $x10, 17
+    $x11 = AND $x12, $x11
+    $x10 = SUB $x10, $x11
+    PseudoRET implicit $x10
+
+...
+---
+name:            outline_2
+tracksRegLiveness: true
+isOutlined: false
+body:             |
+  bb.0:
+    liveins: $x10, $x11
+
+    ; RV32I-MO-LABEL: name: outline_2
+    ; RV32I-MO: liveins: $x10, $x11
+    ; RV32I-MO-NEXT: {{  $}}
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET implicit $x12
+    ;
+    ; RV64I-MO-LABEL: name: outline_2
+    ; RV64I-MO: liveins: $x10, $x11
+    ; RV64I-MO-NEXT: {{  $}}
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET implicit $x12
+    $x11 = ORI $x11, 1023
+    $x12 = ADDI $x10, 17
+    $x11 = AND $x12, $x11
+    $x10 = SUB $x10, $x11
+    PseudoRET implicit $x12
+...

From d51f23377a77eace4ef006e0e6b23460ed05576c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 22 Feb 2025 02:23:06 +0700
Subject: [PATCH 184/282] AMDGPU: Add some release 20 notes (#128136)

---
 llvm/docs/ReleaseNotes.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index c80aecfdea084..e654509792652 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -159,6 +159,17 @@ Changes to the AArch64 Backend
 Changes to the AMDGPU Backend
 -----------------------------
 
+* Initial support for gfx950
+
+* Improved ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` lowering
+
+* Fixed expansion of 64-bit flat address space ``atomicrmw`` and
+  ``cmpxchg`` operations which may access private
+  memory. `noalias.addrspace` metadat may be used to avoid the
+  expansion if the target address is known to not be on the stack.
+
+* Fix compile failures when emitting unreachable functions.
+
 * Removed `llvm.amdgcn.flat.atomic.fadd` and
   `llvm.amdgcn.global.atomic.fadd` intrinsics. Users should use the
   {ref}`atomicrmw <i_atomicrmw>` instruction with `fadd` and

From b727a13fecc4e29b6f8499afd95626795c9f6a8e Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@hanshq.net>
Date: Thu, 20 Feb 2025 11:02:33 +0100
Subject: [PATCH 185/282] Add Wasm, RISC-V, BPF, and NVPTX targets back to
 Windows release packaging (#127794)

In #106059 we reduced the targets to those supported by Windows (X86 and
ARM) to avoid running into size limitations of the NSIS compiler.

Since then, people complained about the lack of Wasm [1], RISC-V [2],
BPF [3], and NVPTX [4]. These do seem to fit in the installer (at least
for 20.1.0-rc2), so let's add them back.

[1]
https://discourse.llvm.org/t/llvm-19-x-release-third-party-binaries/80374/26
[2]
https://discourse.llvm.org/t/llvm-19-x-release-third-party-binaries/80374/53
[3] https://github.com/llvm/llvm-project/issues/127120
[4]
https://github.com/llvm/llvm-project/pull/127794#issuecomment-2668677203

(cherry picked from commit 6e047a5ab42698165a4746ef681396fab1698327)
---
 llvm/utils/release/build_llvm_release.bat | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index dd041d7d384ec..1c30673cf88bd 100755
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -150,7 +150,7 @@ set common_cmake_flags=^
   -DCMAKE_BUILD_TYPE=Release ^
   -DLLVM_ENABLE_ASSERTIONS=OFF ^
   -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^
-  -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^
+  -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86;BPF;WebAssembly;RISCV;NVPTX" ^
   -DLLVM_BUILD_LLVM_C_DYLIB=ON ^
   -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^
   -DPython3_FIND_REGISTRY=NEVER ^

From b84ffb9f3b349dd4548a9d3c0ead91021b7905a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Jod=C5=82owski?= <jodelek@gmail.com>
Date: Wed, 19 Feb 2025 14:41:07 -0800
Subject: [PATCH 186/282] [CUDA] Add support for sm101 and sm120 target
 architectures (#127187)

Add support for sm101 and sm120 target architectures. It requires CUDA
12.8.

---------

Co-authored-by: Sebastian Jodlowski <sjodlowski@nuro.ai>
(cherry picked from commit 0127f169dc8e0b5b6c2a24f74cd42d9d277916f6)
---
 clang/include/clang/Basic/BuiltinsNVPTX.td    |  8 ++++---
 clang/include/clang/Basic/Cuda.h              |  4 ++++
 clang/lib/Basic/Cuda.cpp                      |  8 +++++++
 clang/lib/Basic/Targets/NVPTX.cpp             | 23 +++++++++++++++----
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |  4 ++++
 .../test/Misc/target-invalid-cpu-note/nvptx.c |  4 ++++
 6 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 9d24a992563a4..b550fff8567df 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -21,12 +21,14 @@ class SM<string version, list<SMFeatures> newer_list> : SMFeatures {
                         !strconcat(f, "|", newer.Features));
 }
 
+let Features = "sm_120a" in def SM_120a : SMFeatures;
+let Features = "sm_101a" in def SM_101a : SMFeatures;
 let Features = "sm_100a" in def SM_100a : SMFeatures;
-
-def SM_100 : SM<"100", [SM_100a]>;
-
 let Features = "sm_90a" in def SM_90a : SMFeatures;
 
+def SM_120 : SM<"120", [SM_120a]>;
+def SM_101 : SM<"101", [SM_101a, SM_120]>;
+def SM_100 : SM<"100", [SM_100a, SM_101]>;
 def SM_90 : SM<"90", [SM_90a, SM_100]>;
 def SM_89 : SM<"89", [SM_90]>;
 def SM_87 : SM<"87", [SM_89]>;
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index f33ba46233a7a..5c909a8e9ca11 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -82,6 +82,10 @@ enum class OffloadArch {
   SM_90a,
   SM_100,
   SM_100a,
+  SM_101,
+  SM_101a,
+  SM_120,
+  SM_120a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 1bfec0b37c5ee..79cac0ec119dd 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -100,6 +100,10 @@ static const OffloadArchToStringMap arch_names[] = {
     SM(90a),                         // Hopper
     SM(100),                         // Blackwell
     SM(100a),                        // Blackwell
+    SM(101),                         // Blackwell
+    SM(101a),                        // Blackwell
+    SM(120),                         // Blackwell
+    SM(120a),                        // Blackwell
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -230,6 +234,10 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
     return CudaVersion::CUDA_120;
   case OffloadArch::SM_100:
   case OffloadArch::SM_100a:
+  case OffloadArch::SM_101:
+  case OffloadArch::SM_101a:
+  case OffloadArch::SM_120:
+  case OffloadArch::SM_120a:
     return CudaVersion::CUDA_128;
   default:
     llvm_unreachable("invalid enum");
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index a03f4983b9d03..9be12cbe7ac19 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -176,7 +176,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
     // Set __CUDA_ARCH__ for the GPU specified.
-    std::string CUDAArchCode = [this] {
+    llvm::StringRef CUDAArchCode = [this] {
       switch (GPU) {
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
@@ -283,14 +283,27 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::SM_100:
       case OffloadArch::SM_100a:
         return "1000";
+      case OffloadArch::SM_101:
+      case OffloadArch::SM_101a:
+         return "1010";
+      case OffloadArch::SM_120:
+      case OffloadArch::SM_120a:
+         return "1200";
       }
       llvm_unreachable("unhandled OffloadArch");
     }();
     Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
-    if (GPU == OffloadArch::SM_90a)
-      Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
-    if (GPU == OffloadArch::SM_100a)
-      Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1");
+    switch(GPU) {
+      case OffloadArch::SM_90a:
+      case OffloadArch::SM_100a:
+      case OffloadArch::SM_101a:
+      case OffloadArch::SM_120a:
+        Builder.defineMacro("__CUDA_ARCH_FEAT_SM" + CUDAArchCode.drop_back() + "_ALL", "1");
+        break;
+      default:
+        // Do nothing if this is not an enhanced architecture.
+        break;
+    }
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index c13928f61a748..dc417880a50e9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2278,6 +2278,10 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::SM_90a:
       case OffloadArch::SM_100:
       case OffloadArch::SM_100a:
+      case OffloadArch::SM_101:
+      case OffloadArch::SM_101a:
+      case OffloadArch::SM_120:
+      case OffloadArch::SM_120a:
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
       case OffloadArch::GFX602:
diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
index 3afcdf8c9fe5c..d8e4d7e63e234 100644
--- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c
+++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
@@ -28,6 +28,10 @@
 // CHECK-SAME: {{^}}, sm_90a
 // CHECK-SAME: {{^}}, sm_100
 // CHECK-SAME: {{^}}, sm_100a
+// CHECK-SAME: {{^}}, sm_101
+// CHECK-SAME: {{^}}, sm_101a
+// CHECK-SAME: {{^}}, sm_120
+// CHECK-SAME: {{^}}, sm_120a
 // CHECK-SAME: {{^}}, gfx600
 // CHECK-SAME: {{^}}, gfx601
 // CHECK-SAME: {{^}}, gfx602

From af9d7dda2125c2ab10758ce6b5a968fd56af5048 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 5 Feb 2025 08:33:14 -0500
Subject: [PATCH 187/282] [libc++] Fix stray usage of
 _LIBCPP_HAS_NO_WIDE_CHARACTERS on Windows

(cherry picked from commit bcfd9f81e1bc9954d616ffbb8625099916bebd5b)
---
 libcxx/include/__locale_dir/support/windows.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__locale_dir/support/windows.h b/libcxx/include/__locale_dir/support/windows.h
index ff89d3e87eb44..f0f76c527264a 100644
--- a/libcxx/include/__locale_dir/support/windows.h
+++ b/libcxx/include/__locale_dir/support/windows.h
@@ -215,7 +215,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, s
   return ::_strxfrm_l(__dest, __src, __n, __loc);
 }
 
-#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+#if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t __loc) {
   return ::_iswctype_l(__c, __type, __loc);
 }
@@ -240,7 +240,7 @@ inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t*
 inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t __loc) {
   return ::_wcsxfrm_l(__dest, __src, __n, __loc);
 }
-#endif // !_LIBCPP_HAS_NO_WIDE_CHARACTERS
+#endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 #if defined(__MINGW32__) && __MSVCRT_VERSION__ < 0x0800
 _LIBCPP_EXPORTED_FROM_ABI size_t __strftime(char*, size_t, const char*, const struct tm*, __locale_t);

From 43a04b1db60414089bc7f864feb7cd8be7600498 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Thu, 20 Feb 2025 08:38:42 -0500
Subject: [PATCH 188/282] [libc++] Reduce the dependency of the locale base API
 on the base system from the headers (#117764)

Many parts of the locale base API are only required when building the
shared/static library, but not from the headers. Document those
functions and carve out a few of those that don't work when
_XOPEN_SOURCE is defined to something old.

Fixes #117630

(cherry picked from commit f00b32e2d0ee666d32f1ddd0c687e269fab95b44)
---
 libcxx/include/__locale_dir/locale_base_api.h | 56 ++++++++++++-------
 .../include/__locale_dir/support/bsd_like.h   | 22 +++++---
 libcxx/include/__locale_dir/support/fuchsia.h |  9 ++-
 .../support/no_locale/characters.h            |  8 ++-
 libcxx/include/__locale_dir/support/windows.h | 18 ++++--
 libcxx/test/libcxx/xopen_source.gen.py        | 53 ++++++++++++++++++
 6 files changed, 128 insertions(+), 38 deletions(-)
 create mode 100644 libcxx/test/libcxx/xopen_source.gen.py

diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index bbee9f49867fd..c1e73caeecced 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -23,12 +23,16 @@
 // Variadic functions may be implemented as templates with a parameter pack instead
 // of C-style variadic functions.
 //
+// Most of these functions are only required when building the library. Functions that are also
+// required when merely using the headers are marked as such below.
+//
 // TODO: __localeconv shouldn't take a reference, but the Windows implementation doesn't allow copying __locale_t
+// TODO: Eliminate the need for any of these functions from the headers.
 //
 // Locale management
 // -----------------
 // namespace __locale {
-//  using __locale_t = implementation-defined;
+//  using __locale_t = implementation-defined;  // required by the headers
 //  using __lconv_t  = implementation-defined;
 //  __locale_t  __newlocale(int, const char*, __locale_t);
 //  void        __freelocale(__locale_t);
@@ -36,6 +40,7 @@
 //  __lconv_t*  __localeconv(__locale_t&);
 // }
 //
+// // required by the headers
 // #define _LIBCPP_COLLATE_MASK   /* implementation-defined */
 // #define _LIBCPP_CTYPE_MASK     /* implementation-defined */
 // #define _LIBCPP_MONETARY_MASK  /* implementation-defined */
@@ -48,6 +53,7 @@
 // Strtonum functions
 // ------------------
 // namespace __locale {
+//  // required by the headers
 //  float               __strtof(const char*, char**, __locale_t);
 //  double              __strtod(const char*, char**, __locale_t);
 //  long double         __strtold(const char*, char**, __locale_t);
@@ -60,8 +66,8 @@
 // namespace __locale {
 //  int     __islower(int, __locale_t);
 //  int     __isupper(int, __locale_t);
-//  int     __isdigit(int, __locale_t);
-//  int     __isxdigit(int, __locale_t);
+//  int     __isdigit(int, __locale_t);  // required by the headers
+//  int     __isxdigit(int, __locale_t); // required by the headers
 //  int     __toupper(int, __locale_t);
 //  int     __tolower(int, __locale_t);
 //  int     __strcoll(const char*, const char*, __locale_t);
@@ -99,9 +105,10 @@
 //  int     __mbtowc(wchar_t*, const char*, size_t, __locale_t);
 //  size_t  __mbrlen(const char*, size_t, mbstate_t*, __locale_t);
 //  size_t  __mbsrtowcs(wchar_t*, const char**, size_t, mbstate_t*, __locale_t);
-//  int     __snprintf(char*, size_t, __locale_t, const char*, ...);
-//  int     __asprintf(char**, __locale_t, const char*, ...);
-//  int     __sscanf(const char*, __locale_t, const char*, ...);
+//
+//  int     __snprintf(char*, size_t, __locale_t, const char*, ...); // required by the headers
+//  int     __asprintf(char**, __locale_t, const char*, ...);        // required by the headers
+//  int     __sscanf(const char*, __locale_t, const char*, ...);     // required by the headers
 // }
 
 #if defined(__APPLE__)
@@ -143,8 +150,19 @@ namespace __locale {
 //
 // Locale management
 //
+#  define _LIBCPP_COLLATE_MASK LC_COLLATE_MASK
+#  define _LIBCPP_CTYPE_MASK LC_CTYPE_MASK
+#  define _LIBCPP_MONETARY_MASK LC_MONETARY_MASK
+#  define _LIBCPP_NUMERIC_MASK LC_NUMERIC_MASK
+#  define _LIBCPP_TIME_MASK LC_TIME_MASK
+#  define _LIBCPP_MESSAGES_MASK LC_MESSAGES_MASK
+#  define _LIBCPP_ALL_MASK LC_ALL_MASK
+#  define _LIBCPP_LC_ALL LC_ALL
+
 using __locale_t _LIBCPP_NODEBUG = locale_t;
-using __lconv_t _LIBCPP_NODEBUG  = lconv;
+
+#  if defined(_LIBCPP_BUILDING_LIBRARY)
+using __lconv_t _LIBCPP_NODEBUG = lconv;
 
 inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) {
   return newlocale(__category_mask, __name, __loc);
@@ -157,15 +175,7 @@ inline _LIBCPP_HIDE_FROM_ABI char* __setlocale(int __category, char const* __loc
 inline _LIBCPP_HIDE_FROM_ABI void __freelocale(__locale_t __loc) { freelocale(__loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI __lconv_t* __localeconv(__locale_t& __loc) { return __libcpp_localeconv_l(__loc); }
-
-#  define _LIBCPP_COLLATE_MASK LC_COLLATE_MASK
-#  define _LIBCPP_CTYPE_MASK LC_CTYPE_MASK
-#  define _LIBCPP_MONETARY_MASK LC_MONETARY_MASK
-#  define _LIBCPP_NUMERIC_MASK LC_NUMERIC_MASK
-#  define _LIBCPP_TIME_MASK LC_TIME_MASK
-#  define _LIBCPP_MESSAGES_MASK LC_MESSAGES_MASK
-#  define _LIBCPP_ALL_MASK LC_ALL_MASK
-#  define _LIBCPP_LC_ALL LC_ALL
+#  endif // _LIBCPP_BUILDING_LIBRARY
 
 //
 // Strtonum functions
@@ -194,10 +204,15 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
+#  if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __islower(int __ch, __locale_t __loc) { return islower_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __ch, __locale_t __loc) { return isupper_l(__ch, __loc); }
+#  endif
+
 inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __ch, __locale_t __loc) { return isdigit_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __ch, __locale_t __loc) { return isxdigit_l(__ch, __loc); }
+
+#  if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __strcoll(const char* __s1, const char* __s2, __locale_t __loc) {
   return strcoll_l(__s1, __s2, __loc);
 }
@@ -207,7 +222,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, s
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __ch, __locale_t __loc) { return toupper_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __ch, __locale_t __loc) { return tolower_l(__ch, __loc); }
 
-#  if _LIBCPP_HAS_WIDE_CHARACTERS
+#    if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __s1, const wchar_t* __s2, __locale_t __loc) {
   return wcscoll_l(__s1, __s2, __loc);
 }
@@ -229,7 +244,7 @@ inline _LIBCPP_HIDE_FROM_ABI int __iswpunct(wint_t __ch, __locale_t __loc) { ret
 inline _LIBCPP_HIDE_FROM_ABI int __iswxdigit(wint_t __ch, __locale_t __loc) { return iswxdigit_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI wint_t __towupper(wint_t __ch, __locale_t __loc) { return towupper_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI wint_t __towlower(wint_t __ch, __locale_t __loc) { return towlower_l(__ch, __loc); }
-#  endif
+#    endif
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __strftime(char* __s, size_t __max, const char* __format, const tm* __tm, __locale_t __loc) {
@@ -242,7 +257,7 @@ __strftime(char* __s, size_t __max, const char* __format, const tm* __tm, __loca
 inline _LIBCPP_HIDE_FROM_ABI decltype(__libcpp_mb_cur_max_l(__locale_t())) __mb_len_max(__locale_t __loc) {
   return __libcpp_mb_cur_max_l(__loc);
 }
-#  if _LIBCPP_HAS_WIDE_CHARACTERS
+#    if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __ch, __locale_t __loc) { return __libcpp_btowc_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI int __wctob(wint_t __ch, __locale_t __loc) { return __libcpp_wctob_l(__ch, __loc); }
 inline _LIBCPP_HIDE_FROM_ABI size_t
@@ -270,7 +285,8 @@ inline _LIBCPP_HIDE_FROM_ABI size_t
 __mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, __locale_t __loc) {
   return __libcpp_mbsrtowcs_l(__dest, __src, __len, __ps, __loc);
 }
-#  endif
+#    endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#  endif   // _LIBCPP_BUILDING_LIBRARY
 
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat")
diff --git a/libcxx/include/__locale_dir/support/bsd_like.h b/libcxx/include/__locale_dir/support/bsd_like.h
index c0080b13a08cf..405f1589c8c94 100644
--- a/libcxx/include/__locale_dir/support/bsd_like.h
+++ b/libcxx/include/__locale_dir/support/bsd_like.h
@@ -46,7 +46,8 @@ namespace __locale {
 #define _LIBCPP_LC_ALL LC_ALL
 
 using __locale_t = ::locale_t;
-using __lconv_t  = std::lconv;
+#if defined(_LIBCPP_BUILDING_LIBRARY)
+using __lconv_t = std::lconv;
 
 inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __locale, __locale_t __base) {
   return ::newlocale(__category_mask, __locale, __base);
@@ -59,6 +60,7 @@ inline _LIBCPP_HIDE_FROM_ABI char* __setlocale(int __category, char const* __loc
 }
 
 inline _LIBCPP_HIDE_FROM_ABI __lconv_t* __localeconv(__locale_t& __loc) { return ::localeconv_l(__loc); }
+#endif // _LIBCPP_BUILDING_LIBRARY
 
 //
 // Strtonum functions
@@ -87,14 +89,17 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __islower(int __c, __locale_t __loc) { return ::islower_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __c, __locale_t __loc) { return ::isupper_l(__c, __loc); }
+#endif
 
 inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t __loc) { return ::isdigit_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t __loc) { return ::isxdigit_l(__c, __loc); }
 
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t __loc) { return ::toupper_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __c, __locale_t __loc) { return ::tolower_l(__c, __loc); }
@@ -107,7 +112,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, s
   return ::strxfrm_l(__dest, __src, __n, __loc);
 }
 
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t __loc) {
   return ::iswctype_l(__c, __type, __loc);
 }
@@ -143,7 +148,7 @@ inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t*
 inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t __loc) {
   return ::wcsxfrm_l(__dest, __src, __n, __loc);
 }
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#  endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __strftime(char* __s, size_t __max, const char* __format, const struct tm* __tm, __locale_t __loc) {
@@ -155,14 +160,14 @@ __strftime(char* __s, size_t __max, const char* __format, const struct tm* __tm,
 //
 inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __mb_len_max(__locale_t __loc) { return MB_CUR_MAX_L(__loc); }
 
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __c, __locale_t __loc) { return ::btowc_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __wctob(wint_t __c, __locale_t __loc) { return ::wctob_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __wcsnrtombs(char* __dest, const wchar_t** __src, size_t __nwc, size_t __len, mbstate_t* __ps, __locale_t __loc) {
-  return ::wcsnrtombs_l(__dest, __src, __nwc, __len, __ps, __loc);
+  return ::wcsnrtombs_l(__dest, __src, __nwc, __len, __ps, __loc); // wcsnrtombs is a POSIX extension
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t __wcrtomb(char* __s, wchar_t __wc, mbstate_t* __ps, __locale_t __loc) {
@@ -171,7 +176,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __wcrtomb(char* __s, wchar_t __wc, mbstate_t
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __mbsnrtowcs(wchar_t* __dest, const char** __src, size_t __nms, size_t __len, mbstate_t* __ps, __locale_t __loc) {
-  return ::mbsnrtowcs_l(__dest, __src, __nms, __len, __ps, __loc);
+  return ::mbsnrtowcs_l(__dest, __src, __nms, __len, __ps, __loc); // mbsnrtowcs is a POSIX extension
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
@@ -191,7 +196,8 @@ inline _LIBCPP_HIDE_FROM_ABI size_t
 __mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, __locale_t __loc) {
   return ::mbsrtowcs_l(__dest, __src, __len, __ps, __loc);
 }
-#endif
+#  endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#endif   // _LIBCPP_BUILDING_LIBRARY
 
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat")
@@ -211,7 +217,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __
 template <class... _Args>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __asprintf(
     char** __s, __locale_t __loc, const char* __format, _Args&&... __args) {
-  return ::asprintf_l(__s, __loc, __format, std::forward<_Args>(__args)...);
+  return ::asprintf_l(__s, __loc, __format, std::forward<_Args>(__args)...); // non-standard
 }
 
 template <class... _Args>
diff --git a/libcxx/include/__locale_dir/support/fuchsia.h b/libcxx/include/__locale_dir/support/fuchsia.h
index 237f48562d6e0..fb9de74ab7c7b 100644
--- a/libcxx/include/__locale_dir/support/fuchsia.h
+++ b/libcxx/include/__locale_dir/support/fuchsia.h
@@ -50,7 +50,9 @@ struct __locale_guard {
 #define _LIBCPP_LC_ALL LC_ALL
 
 using __locale_t = locale_t;
-using __lconv_t  = std::lconv;
+
+#if defined(_LIBCPP_BUILDING_LIBRARY)
+using __lconv_t = std::lconv;
 
 inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) {
   return ::newlocale(__category_mask, __name, __loc);
@@ -74,7 +76,7 @@ inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __mb_len_max(__locale_t __loc)
   __locale_guard __current(__loc);
   return MB_CUR_MAX;
 }
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __ch, __locale_t __loc) {
   __locale_guard __current(__loc);
   return std::btowc(__ch);
@@ -115,7 +117,8 @@ __mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps,
   __locale_guard __current(__loc);
   return ::mbsrtowcs(__dest, __src, __len, __ps);
 }
-#endif
+#  endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#endif   // _LIBCPP_BUILDING_LIBRARY
 
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat")
diff --git a/libcxx/include/__locale_dir/support/no_locale/characters.h b/libcxx/include/__locale_dir/support/no_locale/characters.h
index 20e45fc350e2e..4fb48ed9ceac1 100644
--- a/libcxx/include/__locale_dir/support/no_locale/characters.h
+++ b/libcxx/include/__locale_dir/support/no_locale/characters.h
@@ -29,14 +29,17 @@ namespace __locale {
 //
 // Character manipulation functions
 //
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __islower(int __c, __locale_t) { return std::islower(__c); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __c, __locale_t) { return std::isupper(__c); }
+#endif
 
 inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t) { return std::isdigit(__c); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t) { return std::isxdigit(__c); }
 
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t) { return std::toupper(__c); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __c, __locale_t) { return std::tolower(__c); }
@@ -49,7 +52,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, s
   return std::strxfrm(__dest, __src, __n);
 }
 
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t) {
   return std::iswctype(__c, __type);
 }
@@ -85,12 +88,13 @@ inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t*
 inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t) {
   return std::wcsxfrm(__dest, __src, __n);
 }
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#  endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __strftime(char* __s, size_t __max, const char* __format, const struct tm* __tm, __locale_t) {
   return std::strftime(__s, __max, __format, __tm);
 }
+#endif // _LIBCPP_BUILDING_LIBRARY
 
 } // namespace __locale
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__locale_dir/support/windows.h b/libcxx/include/__locale_dir/support/windows.h
index f0f76c527264a..56d34c6f0e6ca 100644
--- a/libcxx/include/__locale_dir/support/windows.h
+++ b/libcxx/include/__locale_dir/support/windows.h
@@ -153,6 +153,7 @@ class __locale_t {
   __lconv_storage* __lc_ = nullptr;
 };
 
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 _LIBCPP_EXPORTED_FROM_ABI __locale_t __newlocale(int __mask, const char* __locale, __locale_t __base);
 inline _LIBCPP_HIDE_FROM_ABI void __freelocale(__locale_t __loc) { ::_free_locale(__loc); }
 inline _LIBCPP_HIDE_FROM_ABI char* __setlocale(int __category, const char* __locale) {
@@ -162,6 +163,7 @@ inline _LIBCPP_HIDE_FROM_ABI char* __setlocale(int __category, const char* __loc
   return __new_locale;
 }
 _LIBCPP_EXPORTED_FROM_ABI __lconv_t* __localeconv(__locale_t& __loc);
+#endif // _LIBCPP_BUILDING_LIBRARY
 
 //
 // Strtonum functions
@@ -195,14 +197,17 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __islower(int __c, __locale_t __loc) { return _islower_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __c, __locale_t __loc) { return _isupper_l(__c, __loc); }
+#endif
 
 inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t __loc) { return _isdigit_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t __loc) { return _isxdigit_l(__c, __loc); }
 
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t __loc) { return ::_toupper_l(__c, __loc); }
 
 inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __c, __locale_t __loc) { return ::_tolower_l(__c, __loc); }
@@ -215,7 +220,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, s
   return ::_strxfrm_l(__dest, __src, __n, __loc);
 }
 
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t __loc) {
   return ::_iswctype_l(__c, __type, __loc);
 }
@@ -240,16 +245,16 @@ inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t*
 inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t __loc) {
   return ::_wcsxfrm_l(__dest, __src, __n, __loc);
 }
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#  endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
-#if defined(__MINGW32__) && __MSVCRT_VERSION__ < 0x0800
+#  if defined(__MINGW32__) && __MSVCRT_VERSION__ < 0x0800
 _LIBCPP_EXPORTED_FROM_ABI size_t __strftime(char*, size_t, const char*, const struct tm*, __locale_t);
-#else
+#  else
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __strftime(char* __ret, size_t __n, const char* __format, const struct tm* __tm, __locale_t __loc) {
   return ::_strftime_l(__ret, __n, __format, __tm, __loc);
 }
-#endif
+#  endif
 
 //
 // Other functions
@@ -273,6 +278,7 @@ _LIBCPP_EXPORTED_FROM_ABI size_t __mbrlen(const char* __restrict, size_t, mbstat
 
 _LIBCPP_EXPORTED_FROM_ABI size_t
 __mbsrtowcs(wchar_t* __restrict, const char** __restrict, size_t, mbstate_t* __restrict, __locale_t);
+#endif // _LIBCPP_BUILDING_LIBRARY
 
 _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __snprintf(
     char* __ret, size_t __n, __locale_t __loc, const char* __format, ...);
@@ -297,6 +303,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __s
 _LIBCPP_DIAGNOSTIC_POP
 #undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT
 
+#if defined(_LIBCPP_BUILDING_LIBRARY)
 struct __locale_guard {
   _LIBCPP_HIDE_FROM_ABI __locale_guard(__locale_t __l) : __status(_configthreadlocale(_ENABLE_PER_THREAD_LOCALE)) {
     // Setting the locale can be expensive even when the locale given is
@@ -328,6 +335,7 @@ struct __locale_guard {
   int __status;
   char* __locale_all = nullptr;
 };
+#endif // _LIBCPP_BUILDING_LIBRARY
 
 } // namespace __locale
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/libcxx/xopen_source.gen.py b/libcxx/test/libcxx/xopen_source.gen.py
new file mode 100644
index 0000000000000..3f2686483730a
--- /dev/null
+++ b/libcxx/test/libcxx/xopen_source.gen.py
@@ -0,0 +1,53 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+
+# Make sure that libc++ headers work when defining _XOPEN_SOURCE=500.
+# We may not want to guarantee this forever, but since this works today and
+# it's something that users rely on, it makes sense to put a test on it.
+#
+# https://github.com/llvm/llvm-project/issues/117630
+
+# RUN: %{python} %s %{libcxx-dir}/utils
+# END.
+
+import sys
+
+sys.path.append(sys.argv[1])
+from libcxx.header_information import (
+    lit_header_restrictions,
+    lit_header_undeprecations,
+    public_headers,
+)
+
+for header in public_headers:
+    for version in (500, 600, 700):
+        # TODO: <fstream> currently uses ::fseeko unguarded, which fails with _XOPEN_SOURCE=500.
+        if header == "fstream" and version == 500:
+            continue
+
+        print(
+            f"""\
+//--- {header}.xopen_source_{version}.compile.pass.cpp
+
+// Some parts of the code like <fstream> use non-standard functions in their implementation,
+// and these functions are not provided when _XOPEN_SOURCE is set to older values. This
+// breaks when building with modules even when we don't use the offending headers directly.
+// UNSUPPORTED: clang-modules-build
+
+// The AIX localization support uses some functions as part of their headers that require a
+// recent value of _XOPEN_SOURCE.
+// UNSUPPORTED: LIBCXX-AIX-FIXME
+
+{lit_header_restrictions.get(header, '')}
+{lit_header_undeprecations.get(header, '')}
+
+// ADDITIONAL_COMPILE_FLAGS: -D_XOPEN_SOURCE={version}
+
+#include <{header}>
+"""
+        )

From e0c4a3397fd2f80740d776de85360dc12cd0bcc7 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 19 Feb 2025 16:46:59 -0600
Subject: [PATCH 189/282] [Clang] Fix cross-lane scan when given divergent
 lanes (#127703)

Summary:
The scan operation implemented here only works if there are contiguous
ones in the executation mask that can be used to propagate the result.
There are two solutions to this, one is to enter 'whole-wave-mode' and
forcibly turn them back on, or to do this serially. This implementation
does the latter because it's more portable, but checks to see if the
parallel fast-path is applicable.

Needs to be backported for correct behavior and because it fixes a
failing libc test.

(cherry picked from commit 6cc7ca084a5bbb7ccf606cab12065604453dde59)
---
 clang/lib/Headers/gpuintrin.h                 | 74 ++++++++++++-------
 clang/lib/Headers/nvptxintrin.h               |  5 +-
 .../src/__support/GPU/scan_reduce.cpp         | 49 ++++++++++++
 3 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 11c87e85cd497..efdc3d94ac0b3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                             __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix)                                        \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
-      uint64_t __lane_mask, __type __x) {                                      \
-    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
-      uint32_t __index = __step + __gpu_lane_id();                             \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,           \
-                                          __gpu_num_lanes());                  \
-    }                                                                          \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
-  }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
-
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
       uint64_t __lane_mask, uint32_t __x) {                                    \
-    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
-      uint32_t __index = __gpu_lane_id() - __step;                             \
-      __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
-      __x += __builtin_bit_cast(                                               \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
-                                                __gpu_shuffle_idx_##__suffix(  \
-                                                    __lane_mask, __index, __x, \
-                                                    __gpu_num_lanes())));      \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
+    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
+        __lane_mask, __first & (__first + 1));                                 \
+    if (__divergent) {                                                         \
+      __type __accum = 0;                                                      \
+      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
+        __type __index = __builtin_ctzll(__mask);                              \
+        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+                                                    __gpu_num_lanes());        \
+        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
+        __accum += __tmp;                                                      \
+      }                                                                        \
+    } else {                                                                   \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
+        uint32_t __index = __gpu_lane_id() - __step;                           \
+        __bitmask_type bitmask = __gpu_lane_id() >= __step;                    \
+        __x += __builtin_bit_cast(                                             \
+            __type,                                                            \
+            -bitmask & __builtin_bit_cast(__bitmask_type,                      \
+                                          __gpu_shuffle_idx_##__suffix(        \
+                                              __lane_mask, __index, __x,       \
+                                              __gpu_num_lanes())));            \
+      }                                                                        \
     }                                                                          \
     return __x;                                                                \
   }
@@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
 __DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
 
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
+    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
+        __lane_mask, __first & (__first + 1));                                 \
+    if (__divergent) {                                                         \
+      return __gpu_shuffle_idx_##__suffix(                                     \
+          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
+          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
+    } else {                                                                   \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
+        uint32_t __index = __step + __gpu_lane_id();                           \
+        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
+                                            __gpu_num_lanes());                \
+      }                                                                        \
+      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
+    }                                                                          \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..0afcb1c5ff0f0 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -151,8 +151,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                       uint32_t __width) {
+  // Mask out inactive lanes to match AMDGPU behavior.
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+  bool __bitmask = (1ull << __idx) & __lane_mask;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                   ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index bc621c3300cbe..1d50e1f99bf31 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -53,10 +53,59 @@ static void test_scan() {
   EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
 }
 
+static uint32_t random(uint64_t *rand_next) {
+  uint64_t x = *rand_next;
+  x ^= x >> 12;
+  x ^= x << 25;
+  x ^= x >> 27;
+  *rand_next = x;
+  return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
+}
+
+// Scan operations can break down under thread divergence, make sure that the
+// function works under some random divergence. We do this by trivially
+// implementing a scan with shared scratch memory and then comparing the
+// results.
+static void test_scan_divergent() {
+  static uint32_t input[64] = {0};
+  static uint32_t result[64] = {0};
+  uint64_t state = gpu::processor_clock() + __gpu_lane_id();
+
+  for (int i = 0; i < 64; ++i) {
+    uint64_t lanemask = gpu::get_lane_mask();
+    if (random(&state) & (1ull << gpu::get_lane_id())) {
+      uint64_t divergent = gpu::get_lane_mask();
+      uint32_t value = random(&state) % 256;
+      input[gpu::get_lane_id()] = value;
+
+      if (gpu::is_first_lane(divergent)) {
+        uint32_t accumulator = 0;
+        for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
+          uint32_t tmp = input[lane];
+          result[lane] = tmp + accumulator;
+          accumulator += tmp;
+        }
+      }
+      gpu::sync_lane(divergent);
+
+      uint32_t scan = gpu::scan(divergent, value);
+      EXPECT_EQ(scan, result[gpu::get_lane_id()]);
+    }
+    if (gpu::is_first_lane(lanemask))
+      __builtin_memset(input, 0, sizeof(input));
+    gpu::sync_lane(lanemask);
+  }
+}
+
 TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
   test_reduce();
 
   test_scan();
 
+  test_scan_divergent();
+
   return 0;
 }

From e6d4fd035fdf90348fbeba6e73f90feb6e66b30b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 21 Feb 2025 12:08:49 +0700
Subject: [PATCH 190/282] AMDGPU: Widen f16 minimum/maximum to v2f16 on gfx950
 (#128121)

Unfortunately we only have the vector versions of v2f16 minimum3
and maximum. Widen to v2f16 so we can lower as minimum333(x, y, y).

(cherry picked from commit e729dc759d052de122c8a918fe51b05ac796bb50)
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  40 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h      |   1 +
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll        | 689 ++++++++++++-------
 llvm/test/CodeGen/AMDGPU/fminimum3.ll        | 689 ++++++++++++-------
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll |  66 +-
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll |  66 +-
 6 files changed, 966 insertions(+), 585 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e09df53995d61..d45ae7398e25d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -869,8 +869,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     if (Subtarget->hasMinimum3Maximum3F32())
       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
 
-    if (Subtarget->hasMinimum3Maximum3PKF16())
+    if (Subtarget->hasMinimum3Maximum3PKF16()) {
       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
+
+      // If only the vector form is available, we need to widen to a vector.
+      if (!Subtarget->hasMinimum3Maximum3F16())
+        setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+    }
   }
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
@@ -5963,6 +5968,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return lowerFMINNUM_FMAXNUM(Op, DAG);
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
+    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
   case ISD::FLDEXP:
   case ISD::STRICT_FLDEXP:
     return lowerFLDEXP(Op, DAG);
@@ -5984,8 +5992,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMUL:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
-  case ISD::FMINIMUM:
-  case ISD::FMAXIMUM:
   case ISD::FMINIMUMNUM:
   case ISD::FMAXIMUMNUM:
   case ISD::UADDSAT:
@@ -6840,6 +6846,34 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
   return Op;
 }
 
+SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return splitBinaryVectorOp(Op, DAG);
+
+  assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() &&
+         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
+         "should not need to widen f16 minimum/maximum to v2f16");
+
+  // Widen f16 operation to v2f16
+
+  // fminimum f16:x, f16:y ->
+  //   extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
+  //                                (v2f16 (scalar_to_vector y))), 0
+  SDLoc SL(Op);
+  SDValue WideSrc0 =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
+  SDValue WideSrc1 =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
+
+  SDValue Widened =
+      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
+                     DAG.getConstant(0, SL, MVT::i32));
+}
+
 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
   bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1cd7f1b29e077..9b2c14862407a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index f0fa621e3b4bc..6724c37605eb4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1251,19 +1251,27 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   ret half %max1
@@ -1280,19 +1288,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v2, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_commute:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v2, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %c, half %max0)
   ret half %max1
@@ -1309,22 +1325,34 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_fmaximum3_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_max_f16_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX942-LABEL: s_fmaximum3_f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_max_f16_e32 v1, s0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, s2, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_fmaximum3_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s2, s2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   %cast = bitcast half %max1 to i16
@@ -1344,19 +1372,28 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fabs0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, |v0|, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, |v0|, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1374,19 +1411,28 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fabs1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, v0, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, v0, |v1|
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1404,19 +1450,28 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fabs2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
@@ -1434,19 +1489,30 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, |v0|, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, |v0|, |v1|
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1466,19 +1532,30 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, -v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, -v0, -v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e64 v1, v0, -v2
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %b.fneg = fneg half %b
   %c.fneg = fneg half %c
@@ -1498,19 +1575,30 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e64 v1, v0, -|v2|
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e64 v1, v0, -|v2|
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT:    v_or_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT:    v_or_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1533,19 +1621,28 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fneg0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, -v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, -v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1563,19 +1660,28 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fneg1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e64 v3, v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e64 v3, v0, -v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
   %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1593,19 +1699,28 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_fneg2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e64 v1, v0, -v2
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
@@ -1623,19 +1738,28 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, 0x4800, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_const0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v2, 0x4800, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_const0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v2, 0x4800, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_const0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_movk_i32 s0, 0x4800
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   ret half %max1
@@ -1652,19 +1776,27 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 0x4800
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16__const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16__const2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16__const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_movk_i32 s0, 0x4800
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
   ret half %max1
@@ -1681,19 +1813,27 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v2, 4.0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v2, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   ret half %max1
@@ -1710,19 +1850,27 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, 4.0, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
   ret half %max1
@@ -1741,19 +1889,28 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, s0, 0x4c00
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, 0x4c00, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, 0x4c00, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_movk_i32 s0, 0x4800
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_movk_i32 s0, 0x4c00
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
   %max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
   ret half %max1
@@ -3620,20 +3777,30 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v0, v2, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   %insert.0 = insertelement <2 x half> poison, half %max0, i32 0
@@ -3651,23 +3818,35 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in
 ; GFX12-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_no_fmaximum3_f16__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_max_f16_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_max_f16_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX942-LABEL: s_no_fmaximum3_f16__multi_use:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_max_f16_e32 v1, s0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_max_f16_e32 v1, s2, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX942-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_no_fmaximum3_f16__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v0, s2, s2
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   %cast0 = bitcast half %max0 to i16
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 7a8a224c76a83..04d21548187a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1251,19 +1251,27 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   ret half %max1
@@ -1280,19 +1288,27 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v2, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_commute:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v2, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v2, v0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %c, half %max0)
   ret half %max1
@@ -1309,22 +1325,34 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_fminimum3_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_min_f16_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX942-LABEL: s_fminimum3_f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_min_f16_e32 v1, s0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, s2, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_fminimum3_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s1, s1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s2, s2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   %cast = bitcast half %max1 to i16
@@ -1344,19 +1372,28 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fabs0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, |v0|, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, |v0|, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1374,19 +1411,28 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fabs1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, v0, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, v0, |v1|
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1404,19 +1450,28 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fabs2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e64 v1, v0, |v2|
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
@@ -1434,19 +1489,30 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, |v0|, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, |v0|, |v1|
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e64 v1, v0, |v2|
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1466,19 +1532,30 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, -v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, -v0, -v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e64 v1, v0, -v2
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %b.fneg = fneg half %b
   %c.fneg = fneg half %c
@@ -1498,19 +1575,30 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e64 v1, v0, -|v2|
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, -|v0|, -|v1|
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e64 v1, v0, -|v2|
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT:    v_or_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT:    v_or_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1533,19 +1621,28 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fneg0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, -v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, -v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1563,19 +1660,28 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fneg1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e64 v3, v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e64 v3, v0, -v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
   %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1593,19 +1699,28 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_fneg2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e64 v1, v0, -v2
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
@@ -1623,19 +1738,28 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, 0x4800, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_const0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v2, 0x4800, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_const0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v2, 0x4800, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_const0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_movk_i32 s0, 0x4800
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half 8.0, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   ret half %max1
@@ -1652,19 +1776,27 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 0x4800
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16__const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16__const2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16__const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_movk_i32 s0, 0x4800
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
   ret half %max1
@@ -1681,19 +1813,27 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v2, 4.0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v2, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half 4.0, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   ret half %max1
@@ -1710,19 +1850,27 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16__inlineimm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16__inlineimm:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, 4.0, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16__inlineimm:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
   ret half %max1
@@ -1741,19 +1889,28 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, s0, 0x4c00
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f16_const1_const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, 0x4c00, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_const1_const2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, 0x4c00, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_const1_const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_movk_i32 s0, 0x4800
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_movk_i32 s0, 0x4c00
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half 8.0)
   %max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
   ret half %max1
@@ -3620,20 +3777,30 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_no_fminimum3_f16__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v1, v0, v2, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   %insert.0 = insertelement <2 x half> poison, half %max0, i32 0
@@ -3651,23 +3818,35 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in
 ; GFX12-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_no_fminimum3_f16__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_min_f16_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_min_f16_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX942-LABEL: s_no_fminimum3_f16__multi_use:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_min_f16_e32 v1, s0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_min_f16_e32 v1, s2, v0
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX942-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_no_fminimum3_f16__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s1, s1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_pk_minimum3_f16 v1, v0, s2, s2
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   %cast0 = bitcast half %max0 to i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 4532571d5cf2a..e828a12442fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -42,11 +42,7 @@ define half @v_maximum_f16(half %src0, half %src1) {
 ; GFX950-LABEL: v_maximum_f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16:
@@ -96,11 +92,17 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -162,11 +164,7 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
 ; GFX950-LABEL: v_maximum_f16__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nsz:
@@ -216,11 +214,17 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -286,11 +290,7 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_src0:
@@ -367,11 +367,7 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_src1:
@@ -458,12 +454,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
 ; GFX950-LABEL: s_maximum_f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_mov_b32_e32 v0, s1
-; GFX950-NEXT:    v_max_f16_e32 v1, s0, v0
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use v0
@@ -2505,3 +2498,4 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 0b9cb9682ea5f..9a2ef15737308 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -30,11 +30,7 @@ define half @v_minimum_f16(half %src0, half %src1) {
 ; GFX950-LABEL: v_minimum_f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16:
@@ -74,11 +70,17 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f16__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -127,11 +129,7 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
 ; GFX950-LABEL: v_minimum_f16__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nsz:
@@ -171,11 +169,17 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f16__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -227,11 +231,7 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_src0:
@@ -294,11 +294,7 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_src1:
@@ -368,12 +364,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
 ; GFX950-LABEL: s_minimum_f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_mov_b32_e32 v0, s1
-; GFX950-NEXT:    v_min_f16_e32 v1, s0, v0
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, s1, s1
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use v0
@@ -1924,3 +1917,4 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
+; GFX9: {{.*}}

From 098492a228f781a37997637e0953fd4e7faa2193 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Thu, 13 Feb 2025 14:19:51 -0800
Subject: [PATCH 191/282] [SLP] Check for PHI nodes (potentially cycles!) when
 checking dependencies

When checking for dependecies for gather nodes with users with the same
last instruction, cannot rely on the index order, if there is (even
potential!) cycle in the graph, which may cause order not work correctly
and cause compiler crash.

Fixes #127128

(cherry picked from commit ac217ee389d63124432e5e6890851a678f7a676b)
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 10 +++-
 .../X86/delayed-gather-emission.ll            |  2 +-
 .../X86/matching-gather-nodes-phi-users.ll    |  2 +-
 .../X86/perfect-matched-reused-bv.ll          | 13 ++--
 .../SLPVectorizer/X86/phi-node-with-cycle.ll  | 59 +++++++++++++++++++
 5 files changed, 77 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 19963e780ebd3..7b20eda550095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13181,8 +13181,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
           continue;
         // If the user instruction is used for some reason in different
         // vectorized nodes - make it depend on index.
+        // If any vector node is PHI node, this dependency might not work
+        // because of cycle dependencies, so disable it.
         if (TEUseEI.UserTE != UseEI.UserTE &&
-            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
+            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
+             any_of(
+                 VectorizableTree,
+                 [](const std::unique_ptr<TreeEntry> &TE) {
+                   return TE->State == TreeEntry::Vectorize &&
+                          TE->getOpcode() == Instruction::PHI;
+                 })))
           continue;
       }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
index 5562291dbb6be..bf3f0c4df74e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
@@ -31,7 +31,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[I2]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP9]] = insertelement <2 x float> [[TMP8]], float [[I2]], i32 0
-; CHECK-NEXT:    [[TMP10]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP10]] = insertelement <2 x float> [[TMP2]], float [[I2]], i32 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[BB1]], label [[BB2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
index 166c819098c8c..d649465c9ff12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
@@ -8,7 +8,7 @@
 ; YAML: Function:        test
 ; YAML: Args:
 ; YAML:   - String:          'Stores SLP vectorized with cost '
-; YAML:   - Cost:            '-6'
+; YAML:   - Cost:            '-3'
 ; YAML:   - String:          ' and with tree size '
 ; YAML:   - TreeSize:        '14'
 ; YAML: ...
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll
index 1053e0fc10669..c4a49242a5583 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll
@@ -7,16 +7,17 @@ define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB4:.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = or <2 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[ADD6:%.*]], %[[BB4:.*]] ]
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ 0, %[[BB4]] ]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[PHI2]], 0
+; CHECK-NEXT:    [[OR3:%.*]] = or i32 [[PHI]], 0
 ; CHECK-NEXT:    br i1 false, label %[[BB7:.*]], label %[[BB4]]
 ; CHECK:       [[BB4]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4]] = add <2 x i32> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[ADD6]] = add i32 [[PHI]], 0
 ; CHECK-NEXT:    br i1 false, label %[[BB7]], label %[[BB1]]
 ; CHECK:       [[BB7]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x i32> [ [[TMP1]], %[[BB1]] ], [ [[TMP3]], %[[BB4]] ]
+; CHECK-NEXT:    [[PHI8:%.*]] = phi i32 [ [[OR]], %[[BB1]] ], [ 0, %[[BB4]] ]
+; CHECK-NEXT:    [[PHI9:%.*]] = phi i32 [ [[OR3]], %[[BB1]] ], [ [[ADD6]], %[[BB4]] ]
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
new file mode 100644
index 0000000000000..22e7e6a8e6624
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell < %s | FileCheck %s
+
+define void @test(float %0) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv <2 x float> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    br label %[[BB6:.*]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    br label %[[BB10:.*]]
+; CHECK:       [[BB9:.*]]:
+; CHECK-NEXT:    br label %[[BB10]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB6]] ], [ poison, %[[BB9]] ]
+; CHECK-NEXT:    br label %[[BB12:.*]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call float @llvm.fabs.f32(float [[TMP17]])
+; CHECK-NEXT:    ret void
+;
+  %2 = fdiv float 0.000000e+00, 0.000000e+00
+  %3 = fdiv float 0.000000e+00, 0.000000e+00
+  %4 = fdiv float %0, 0.000000e+00
+  br label %5
+
+5:
+  %6 = fmul float %4, 0.000000e+00
+  %7 = fsub float 0.000000e+00, %6
+  %8 = fmul float %3, 0.000000e+00
+  %9 = fsub float 0.000000e+00, %8
+  br label %11
+
+10:
+  br label %11
+
+11:
+  %12 = phi float [ %7, %5 ], [ 0.000000e+00, %10 ]
+  %13 = phi float [ %9, %5 ], [ 0.000000e+00, %10 ]
+  br label %14
+
+14:
+  %15 = fmul float %2, 0.000000e+00
+  %16 = fsub float %12, %15
+  %17 = fmul float %4, 0.000000e+00
+  %18 = fsub float %13, %17
+  %19 = fadd float %16, %18
+  %20 = call float @llvm.fabs.f32(float %19)
+  ret void
+}
+

From 160e6ace3e1a35b05d1f277aaf34034b1174c5b0 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 11 Feb 2025 14:32:30 +0100
Subject: [PATCH 192/282] [mlir][cmake] Do not export MLIR_MAIN_SRC_DIR and
 MLIR_INCLUDE_DIR (#125842)

MLIR_MAIN_SRC_DIR and MLIR_INCLUDE_DIR point to the source directory,
which is not installed. As such, the installed MLIRConfig.cmake also
should not reference it.

The comment indicates that these are needed for mlir_tablegen(), but I
don't see any related uses.

The motivation for this is the use in flang, where we end up inheriting
a meaningless MLIR_MAIN_SRC_DIR from a previous MLIR build, whose source
directory doesn't exist anymore, and that cannot be overridden with the
correct path, because it's not a cached variable.

Instead do what all the other projects do for LLVM_MAIN_SRC_DIR and
initialize MLIR_MAIN_SRC_DIR to CMAKE_CURRENT_SOURCE_DIR/../mlir.

For MLIR_INCLUDE_DIR there already is an exported MLIR_INCLUDE_DIRS,
which can be used instead.

(cherry picked from commit 82bd148a3f25439d7f52a32422dc1bcd2da03803)
---
 flang/CMakeLists.txt                                 | 7 ++++---
 flang/include/flang/Optimizer/Dialect/CMakeLists.txt | 2 +-
 mlir/cmake/modules/MLIRConfig.cmake.in               | 4 ----
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index b619553ef8302..b24b177cc21cc 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -79,6 +79,8 @@ if(CMAKE_SIZEOF_VOID_P EQUAL 4)
   message(FATAL_ERROR "flang isn't supported on 32 bit CPUs")
 endif()
 
+set(MLIR_MAIN_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../mlir" CACHE PATH "Path to MLIR source tree")
+
 if (FLANG_STANDALONE_BUILD)
   set(FLANG_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
   set(CMAKE_INCLUDE_CURRENT_DIR ON)
@@ -240,10 +242,9 @@ else()
     set(FLANG_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 
-  set(MLIR_MAIN_SRC_DIR ${LLVM_MAIN_SRC_DIR}/../mlir ) # --src-root
-  set(MLIR_INCLUDE_DIR ${MLIR_MAIN_SRC_DIR}/include ) # --includedir
+  set(MLIR_INCLUDE_DIRS ${MLIR_MAIN_SRC_DIR}/include ) # --includedir
   set(MLIR_TABLEGEN_OUTPUT_DIR ${CMAKE_BINARY_DIR}/tools/mlir/include)
-  include_directories(SYSTEM ${MLIR_INCLUDE_DIR})
+  include_directories(SYSTEM ${MLIR_INCLUDE_DIRS})
   include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR})
 endif()
 
diff --git a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
index 10ab213b30b02..73f388cbab6c9 100644
--- a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
@@ -37,7 +37,7 @@ set_target_properties(flang-doc PROPERTIES FOLDER "Flang/Docs")
 set(dialect_doc_filename "FIRLangRef")
 
 set(LLVM_TARGET_DEFINITIONS FIROps.td)
-tablegen(MLIR ${dialect_doc_filename}.md -gen-op-doc "-I${MLIR_INCLUDE_DIR}")
+tablegen(MLIR ${dialect_doc_filename}.md -gen-op-doc)
 set(GEN_DOC_FILE ${FLANG_BINARY_DIR}/docs/Dialect/${dialect_doc_filename}.md)
 add_custom_command(
         OUTPUT ${GEN_DOC_FILE}
diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in
index 7076d94a32f2b..c695b5787af66 100644
--- a/mlir/cmake/modules/MLIRConfig.cmake.in
+++ b/mlir/cmake/modules/MLIRConfig.cmake.in
@@ -16,10 +16,6 @@ set(MLIR_INSTALL_AGGREGATE_OBJECTS "@MLIR_INSTALL_AGGREGATE_OBJECTS@")
 set(MLIR_ENABLE_BINDINGS_PYTHON "@MLIR_ENABLE_BINDINGS_PYTHON@")
 set(MLIR_ENABLE_EXECUTION_ENGINE "@MLIR_ENABLE_EXECUTION_ENGINE@")
 
-# For mlir_tablegen()
-set(MLIR_INCLUDE_DIR "@MLIR_INCLUDE_DIR@")
-set(MLIR_MAIN_SRC_DIR "@MLIR_MAIN_SRC_DIR@")
-
 set_property(GLOBAL PROPERTY MLIR_ALL_LIBS "@MLIR_ALL_LIBS@")
 set_property(GLOBAL PROPERTY MLIR_DIALECT_LIBS "@MLIR_DIALECT_LIBS@")
 set_property(GLOBAL PROPERTY MLIR_CONVERSION_LIBS "@MLIR_CONVERSION_LIBS@")

From 0bc2eb7458dcd763d063a58fb3ee9626bde8eae8 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Thu, 6 Feb 2025 14:13:43 -0600
Subject: [PATCH 193/282] [OpenMP] Fix misspelled symbol name (#126120)

Summary:
This is supposed to be `__llvm_rpc_client` but I screwed it up and
didn't notice at the time. Will need to be backported.

(cherry picked from commit b35749559ddd9b2d4e044ef71d13d888b8a3d8cb)
---
 llvm/lib/Transforms/IPO/Internalize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/Internalize.cpp b/llvm/lib/Transforms/IPO/Internalize.cpp
index f0270600aa266..404102eef89fc 100644
--- a/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -235,7 +235,7 @@ bool InternalizePass::internalizeModule(Module &M) {
 
   // Preserve the RPC interface for GPU host callbacks when internalizing.
   if (Triple(M.getTargetTriple()).isNVPTX())
-    AlwaysPreserved.insert("__llvm_rpc_server");
+    AlwaysPreserved.insert("__llvm_rpc_client");
 
   // Mark all functions not in the api as internal.
   IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm();

From eb389b1259c35ea0f91b8aba133bca4a8ab9f4ab Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Fri, 21 Feb 2025 10:34:14 +0800
Subject: [PATCH 194/282] [C++20] [Modules] handling selectAny attribute for
 vardecl

Close https://github.com/llvm/llvm-project/issues/127963

The root cause of the problem seems to be that we didn't realize it
simply.

(cherry picked from commit 24c06a19be7bcf28b37e5eabbe65df95a2c0265a)
---
 clang/lib/Sema/SemaDecl.cpp      |  3 ++-
 clang/test/Modules/pr127943.cppm | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Modules/pr127943.cppm

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 1ecb9aff5f319..01f09aba8c2ad 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -4803,7 +4803,8 @@ bool Sema::checkVarDeclRedefinition(VarDecl *Old, VarDecl *New) {
       (New->getFormalLinkage() == Linkage::Internal || New->isInline() ||
        isa<VarTemplateSpecializationDecl>(New) ||
        New->getDescribedVarTemplate() || New->getNumTemplateParameterLists() ||
-       New->getDeclContext()->isDependentContext())) {
+       New->getDeclContext()->isDependentContext() ||
+       New->hasAttr<SelectAnyAttr>())) {
     // The previous definition is hidden, and multiple definitions are
     // permitted (in separate TUs). Demote this to a declaration.
     New->demoteThisDefinitionToDeclaration();
diff --git a/clang/test/Modules/pr127943.cppm b/clang/test/Modules/pr127943.cppm
new file mode 100644
index 0000000000000..7cc3be6903e6a
--- /dev/null
+++ b/clang/test/Modules/pr127943.cppm
@@ -0,0 +1,31 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/repro.cppm -fdeclspec -emit-module-interface -o %t/repro.pcm
+// RUN: %clang_cc1 -std=c++20 %t/source.cpp -fdeclspec -fsyntax-only -verify -fprebuilt-module-path=%t
+
+//--- repro_decl.hpp
+#pragma once
+
+extern "C"
+{
+    __declspec(selectany) int foo = 0;
+}
+
+//--- repro.cppm
+module;
+#include "repro_decl.hpp"
+
+export module repro;
+
+export inline int func()
+{
+    return foo;
+}
+
+//--- source.cpp
+// expected-no-diagnostics
+import repro;
+
+#include "repro_decl.hpp"

From 69ee30f9db773048d314db466aaa187b6843c3c0 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sat, 22 Feb 2025 12:06:34 +0800
Subject: [PATCH 195/282] [DAGCombiner] visitFREEZE: Early exit when N is
 deleted (#128161)

`N` may get merged with existing nodes inside the loop. Early exit when
it is deleted to avoid the crash.
Alternative solution: use `DAGNodeDeletedListener` to refresh the value
of N.

Closes https://github.com/llvm/llvm-project/issues/128143.

(cherry picked from commit 646e4f2eede9a39e46012dde9430cd289682e83c)
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  8 +++--
 llvm/test/CodeGen/X86/pr128143.ll             | 32 +++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr128143.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a0c703d2df8a2..1e023d4f76b2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16170,11 +16170,13 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
                              MaybePoisonOperand);
     }
+
+    // This node has been merged with another.
+    if (N->getOpcode() == ISD::DELETED_NODE)
+      return SDValue(N, 0);
   }
 
-  // This node has been merged with another.
-  if (N->getOpcode() == ISD::DELETED_NODE)
-    return SDValue(N, 0);
+  assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted!");
 
   // The whole node may have been updated, so the value we were holding
   // may no longer be valid. Re-fetch the operand we're `freeze`ing.
diff --git a/llvm/test/CodeGen/X86/pr128143.ll b/llvm/test/CodeGen/X86/pr128143.ll
new file mode 100644
index 0000000000000..2517ad9ebcb6b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr128143.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+@g_1 = external global i8
+@g_2 = external global i8
+
+; Make sure we don't crash on this test.
+
+define i1 @test(i1 %cmp1, i32 %x) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq g_2@GOTPCREL(%rip), %rcx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpq %rcx, g_1@GOTPCREL(%rip)
+; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    cmpl %eax, %esi
+; CHECK-NEXT:    setb %cl
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    andb %dil, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+entry:
+  %cmp2 = icmp ne ptr @g_1, @g_2
+  %fr = freeze ptr @g_1
+  %cmp3 = icmp ne ptr %fr, @g_2
+  %ext1 = zext i1 %cmp3 to i32
+  %sel1 = select i1 %cmp1, i1 %cmp2, i1 false
+  %cmp4 = icmp ult i32 %x, %ext1
+  %sel3 = select i1 %cmp1, i1 %cmp4, i1 false
+  %or = or i1 %sel1, %sel3
+  ret i1 %or
+}

From 350e0e2b45331279a003d3ab7c7178680e533158 Mon Sep 17 00:00:00 2001
From: David Truby <david.truby@arm.com>
Date: Fri, 21 Feb 2025 18:50:52 +0000
Subject: [PATCH 196/282] [flang] fix AArch64 PCS for struct following pointer
 (#127802)

Pointers are already handled as taking up a register in the ABI
handling, but the handling for structs was not taking this into account.
This patch changes the struct handling to acknowledge that pointer
arguments take up an integer register.

Fixes #123075

(cherry picked from commit 449f84fea652e31de418c3087d7e3628809241b4)
---
 flang/lib/Optimizer/CodeGen/Target.cpp          |  7 +++++++
 flang/test/Fir/struct-passing-aarch64-byval.fir | 14 ++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp
index 1bc673bb34e32..2a1eb0bc33f5c 100644
--- a/flang/lib/Optimizer/CodeGen/Target.cpp
+++ b/flang/lib/Optimizer/CodeGen/Target.cpp
@@ -930,6 +930,13 @@ struct TargetAArch64 : public GenericTarget<TargetAArch64> {
         .Case<fir::VectorType>([&](auto) {
           TODO(loc, "passing vector argument to C by value is not supported");
           return NRegs{};
+        })
+        .Default([&](auto ty) {
+          if (fir::conformsWithPassByRef(ty))
+            return NRegs{1, false}; // Pointers take 1 integer register
+          TODO(loc, "unsupported component type for BIND(C), VALUE derived "
+                    "type argument");
+          return NRegs{};
         });
   }
 
diff --git a/flang/test/Fir/struct-passing-aarch64-byval.fir b/flang/test/Fir/struct-passing-aarch64-byval.fir
index 27143459dde2f..087efba393014 100644
--- a/flang/test/Fir/struct-passing-aarch64-byval.fir
+++ b/flang/test/Fir/struct-passing-aarch64-byval.fir
@@ -71,3 +71,17 @@ func.func private @too_many_hfa(!fir.type<hfa_max{i:f128,j:f128,k:f128,l:f128}>,
 
 // CHECK-LABEL: func.func private @too_big(!fir.ref<!fir.type<too_big{i:!fir.array<5xi32>}>> {{{.*}}, llvm.byval = !fir.type<too_big{i:!fir.array<5xi32>}>})
 func.func private @too_big(!fir.type<too_big{i:!fir.array<5xi32>}>)
+
+// CHECK-LABEL: func.func private @pointer_type(!fir.ref<i64>, !fir.array<1xi64>)
+func.func private @pointer_type(!fir.ref<i64>, !fir.type<pointer_type{i:i64}>)
+
+// CHECK-LABEL: func.func private @pointer_type_too_many_int(!fir.ref<i64>, 
+// CHECK-SAME: !fir.array<2xi64>,
+// CHECK-SAME: !fir.array<2xi64>,
+// CHECK-SAME: !fir.array<2xi64>,
+// CHECK-SAME: !fir.ref<!fir.type<int_max{i:i64,j:i64}>> {{{.*}}, llvm.byval = !fir.type<int_max{i:i64,j:i64}>})
+func.func private @pointer_type_too_many_int(!fir.ref<i64>,
+                       !fir.type<int_max{i:i64,j:i64}>,
+                       !fir.type<int_max{i:i64,j:i64}>,
+                       !fir.type<int_max{i:i64,j:i64}>,
+                       !fir.type<int_max{i:i64,j:i64}>)

From d919b8d97b12c24a13e1e92aa66ef1421e20104b Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Tue, 25 Feb 2025 14:35:07 +0800
Subject: [PATCH 197/282] [Clang] Handle instantiating captures in
 addInstantiatedCapturesToScope() (#128478)

addInstantiatedCapturesToScope() might be called when transforming a
lambda body. In this situation, it would look into all the lambda's
parents and figure out all the instantiated captures. However, the
instantiated captures are not visible from lambda's class decl until the
lambda is rebuilt (i.e. after the lambda body transform). So this patch
corrects that by also examining the LambdaScopeInfo, serving as a
workaround for not having deferred lambda body instantiation in Clang
20, to avoid regressing some real-world use cases.

Fixes #128175

(cherry picked from commit ecc7e6ce4cd57a614985e95daf7027918cb8723e)
---
 clang/lib/Sema/SemaConcept.cpp              | 25 ++++++++++++++++++++-
 clang/test/SemaTemplate/concepts-lambda.cpp | 18 +++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 8a77cbf8c9477..a7b609f7f3ce4 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -711,9 +711,32 @@ bool Sema::addInstantiatedCapturesToScope(
 
   unsigned Instantiated = 0;
 
+  // FIXME: This is a workaround for not having deferred lambda body
+  // instantiation.
+  // When transforming a lambda's body, if we encounter another call to a
+  // nested lambda that contains a constraint expression, we add all of the
+  // outer lambda's instantiated captures to the current instantiation scope to
+  // facilitate constraint evaluation. However, these captures don't appear in
+  // the CXXRecordDecl until after the lambda expression is rebuilt, so we
+  // pull them out from the corresponding LSI.
+  LambdaScopeInfo *InstantiatingScope = nullptr;
+  if (LambdaPattern->capture_size() && !LambdaClass->capture_size()) {
+    for (FunctionScopeInfo *Scope : llvm::reverse(FunctionScopes)) {
+      auto *LSI = dyn_cast<LambdaScopeInfo>(Scope);
+      if (!LSI ||
+          LSI->CallOperator->getTemplateInstantiationPattern() != PatternDecl)
+        continue;
+      InstantiatingScope = LSI;
+      break;
+    }
+    assert(InstantiatingScope);
+  }
+
   auto AddSingleCapture = [&](const ValueDecl *CapturedPattern,
                               unsigned Index) {
-    ValueDecl *CapturedVar = LambdaClass->getCapture(Index)->getCapturedVar();
+    ValueDecl *CapturedVar =
+        InstantiatingScope ? InstantiatingScope->Captures[Index].getVariable()
+                           : LambdaClass->getCapture(Index)->getCapturedVar();
     assert(CapturedVar->isInitCapture());
     Scope.InstantiatedLocal(CapturedPattern, CapturedVar);
   };
diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp
index 306f86cfcb28f..dcb09c76d26b6 100644
--- a/clang/test/SemaTemplate/concepts-lambda.cpp
+++ b/clang/test/SemaTemplate/concepts-lambda.cpp
@@ -307,3 +307,21 @@ void test() {
 }
 
 }
+
+namespace GH128175 {
+
+template <class> void f() {
+  [i{0}] {
+    [&] {
+      [&] {
+        []()
+          requires true
+        {}();
+      }();
+    }();
+  }();
+}
+
+template void f<int>();
+
+}

From d6fd6e4d6acfab8744064b03f1a86e16dfe53ec6 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Tue, 25 Feb 2025 13:29:42 +0800
Subject: [PATCH 198/282] [Serialization] Update DECL_LAST

Address post commit review at
https://github.com/llvm/llvm-project/pull/119333#pullrequestreview-2637471908

(cherry picked from commit 366daddfad9aa38ebb7d40055cf65f4ecb7dd6f9)
---
 clang/include/clang/Serialization/ASTBitCodes.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 1b56ed2c9776b..d60cb655aa261 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1522,13 +1522,13 @@ enum DeclCode {
   /// An ImplicitConceptSpecializationDecl record.
   DECL_IMPLICIT_CONCEPT_SPECIALIZATION,
 
-  // A decls specilization record.
+  // A decls specialization record.
   DECL_SPECIALIZATIONS,
 
-  // A decls specilization record.
+  // A decls specialization record.
   DECL_PARTIAL_SPECIALIZATIONS,
 
-  DECL_LAST = DECL_IMPLICIT_CONCEPT_SPECIALIZATION
+  DECL_LAST = DECL_PARTIAL_SPECIALIZATIONS
 };
 
 /// Record codes for each kind of statement or expression.

From c99be9188631e5f5a21a0041d85385bd05659a3d Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Tue, 25 Feb 2025 22:03:17 +0800
Subject: [PATCH 199/282] [X86][DAGCombiner] Skip x87 fp80 values in
 `combineFMulOrFDivWithIntPow2` (#128618)

f80 is not a valid IEEE floating-point type.
Closes https://github.com/llvm/llvm-project/issues/128528.

(cherry picked from commit 44d1dbd24c20a0ee93063dcf44d68e2b8f0bf77c)
---
 llvm/include/llvm/ADT/APFloat.h               |  1 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  7 +++--
 llvm/lib/Support/APFloat.cpp                  |  5 ++++
 .../X86/fold-int-pow2-with-fmul-or-fdiv.ll    | 28 +++++++++++++++++++
 4 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 9792749230cbf..3bff205e7aa9e 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -353,6 +353,7 @@ struct APFloatBase {
   static bool semanticsHasSignedRepr(const fltSemantics &);
   static bool semanticsHasInf(const fltSemantics &);
   static bool semanticsHasNaN(const fltSemantics &);
+  static bool isIEEELikeFP(const fltSemantics &);
 
   // Returns true if any number described by \p Src can be precisely represented
   // by a normal (not subnormal) value in \p Dst.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1e023d4f76b2c..9d04568483677 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17283,6 +17283,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
 // prefer it.
 SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) {
   EVT VT = N->getValueType(0);
+  if (!APFloat::isIEEELikeFP(VT.getFltSemantics()))
+    return SDValue();
+
   SDValue ConstOp, Pow2Op;
 
   std::optional<int> Mantissa;
@@ -17309,8 +17312,8 @@ SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) {
 
       const APFloat &APF = CFP->getValueAPF();
 
-      // Make sure we have normal/ieee constant.
-      if (!APF.isNormal() || !APF.isIEEE())
+      // Make sure we have normal constant.
+      if (!APF.isNormal())
         return false;
 
       // Make sure the floats exponent is within the bounds that this transform
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index b0d92ae37fe8f..cbee7f48b8773 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -353,6 +353,11 @@ bool APFloatBase::semanticsHasNaN(const fltSemantics &semantics) {
   return semantics.nonFiniteBehavior != fltNonfiniteBehavior::FiniteOnly;
 }
 
+bool APFloatBase::isIEEELikeFP(const fltSemantics &semantics) {
+  // Keep in sync with Type::isIEEELikeFPTy
+  return SemanticsToEnum(semantics) <= S_IEEEquad;
+}
+
 bool APFloatBase::isRepresentableAsNormalIn(const fltSemantics &Src,
                                             const fltSemantics &Dst) {
   // Exponent range must be larger.
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 2163121410553..15a9cbecd808a 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1662,3 +1662,31 @@ define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
   %mul = fdiv float 0x3a20000000000000, %conv
   ret float %mul
 }
+
+define x86_fp80 @pr128528(i1 %cond) {
+; CHECK-SSE-LABEL: pr128528:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    testb $1, %dil
+; CHECK-SSE-NEXT:    movl $8, %eax
+; CHECK-SSE-NEXT:    movl $1, %ecx
+; CHECK-SSE-NEXT:    cmovnel %eax, %ecx
+; CHECK-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-SSE-NEXT:    fildl -{{[0-9]+}}(%rsp)
+; CHECK-SSE-NEXT:    fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX-LABEL: pr128528:
+; CHECK-AVX:       # %bb.0:
+; CHECK-AVX-NEXT:    testb $1, %dil
+; CHECK-AVX-NEXT:    movl $8, %eax
+; CHECK-AVX-NEXT:    movl $1, %ecx
+; CHECK-AVX-NEXT:    cmovnel %eax, %ecx
+; CHECK-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT:    fildl -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT:    fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; CHECK-AVX-NEXT:    retq
+  %sub9 = select i1 %cond, i32 8, i32 1
+  %conv = uitofp i32 %sub9 to x86_fp80
+  %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800
+  ret x86_fp80 %mul
+}

From 2cb3798958a454dd84799f5b41043bf95c272d64 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 19 Feb 2025 16:09:52 +0700
Subject: [PATCH 200/282] Revert "[clang][OpenCL][CodeGen][AMDGPU] Do not use
 `private` as the default AS for when `generic` is available (#112442)"

This reverts commit 6e0b0038cd65ce726ce404305a06e1cf33e36cca.

This breaks the rocm-device-libs build, so it should not ship in
the release.
---
 clang/lib/Basic/Targets/AMDGPU.cpp            |   6 +-
 clang/lib/CodeGen/CGBlocks.cpp                |   3 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  11 +-
 clang/test/CodeGen/scoped-fence-ops.c         | 181 ++++---
 .../CodeGenOpenCL/addr-space-struct-arg.cl    | 169 +++---
 .../amdgcn-automatic-variable.cl              |  36 +-
 .../amdgpu-abi-struct-arg-byref.cl            | 282 +++++-----
 .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl    | 495 ++++++++----------
 clang/test/CodeGenOpenCL/amdgpu-nullptr.cl    |  28 +-
 clang/test/CodeGenOpenCL/atomic-ops.cl        |  11 +-
 .../atomics-unsafe-hw-remarks-gfx90a.cl       |   6 +-
 clang/test/CodeGenOpenCL/blocks.cl            |  23 +-
 clang/test/CodeGenOpenCL/builtins-alloca.cl   | 432 +--------------
 .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl    | 143 ++---
 .../CodeGenOpenCL/builtins-amdgcn-gfx940.cl   |  30 +-
 .../builtins-fp-atomics-gfx12.cl              |   4 +-
 .../CodeGenOpenCL/builtins-fp-atomics-gfx8.cl |   2 +-
 .../builtins-fp-atomics-gfx90a.cl             |   2 +-
 .../enqueue-kernel-non-entry-block.cl         |   2 +-
 clang/test/CodeGenOpenCL/opencl_types.cl      |   2 +-
 clang/test/Index/pipe-size.cl                 |   4 +-
 21 files changed, 663 insertions(+), 1209 deletions(-)

diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 0d308cb6af969..9ea366af56a52 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -261,9 +261,9 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
 void AMDGPUTargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
   TargetInfo::adjust(Diags, Opts);
   // ToDo: There are still a few places using default address space as private
-  // address space in OpenCL, which needs to be cleaned up, then the references
-  // to OpenCL can be removed from the following line.
-  setAddressSpaceMap((Opts.OpenCL && !Opts.OpenCLGenericAddressSpace) ||
+  // address space in OpenCL, which needs to be cleaned up, then Opts.OpenCL
+  // can be removed from the following line.
+  setAddressSpaceMap(/*DefaultIsPrivate=*/Opts.OpenCL ||
                      !isAMDGCN(getTriple()));
 }
 
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index a7584a95c8ca7..f38f86c792f69 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1396,8 +1396,7 @@ void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D,
       DI->setLocation(D->getLocation());
       DI->EmitDeclareOfBlockLiteralArgVariable(
           *BlockInfo, D->getName(), argNum,
-          cast<llvm::AllocaInst>(alloc.getPointer()->stripPointerCasts()),
-          Builder);
+          cast<llvm::AllocaInst>(alloc.getPointer()), Builder);
     }
   }
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7ec9d59bfed5c..5237533364294 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6092,13 +6092,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
           /*IndexTypeQuals=*/0);
       auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
       llvm::Value *TmpPtr = Tmp.getPointer();
-      // The EmitLifetime* pair expect a naked Alloca as their last argument,
-      // however for cases where the default AS is not the Alloca AS, Tmp is
-      // actually the Alloca ascasted to the default AS, hence the
-      // stripPointerCasts()
-      llvm::Value *Alloca = TmpPtr->stripPointerCasts();
       llvm::Value *TmpSize = EmitLifetimeStart(
-          CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), Alloca);
+          CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
       llvm::Value *ElemPtr;
       // Each of the following arguments specifies the size of the corresponding
       // argument passed to the enqueued block.
@@ -6114,9 +6109,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         Builder.CreateAlignedStore(
             V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
       }
-      // Return the Alloca itself rather than a potential ascast as this is only
-      // used by the paired EmitLifetimeEnd.
-      return std::tie(ElemPtr, TmpSize, Alloca);
+      return std::tie(ElemPtr, TmpSize, TmpPtr);
     };
 
     // Could have events and/or varargs.
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c
index d83ae05b0aea2..20cbb511a1758 100644
--- a/clang/test/CodeGen/scoped-fence-ops.c
+++ b/clang/test/CodeGen/scoped-fence-ops.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN:   -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN:   -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN-CL12 %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN:   -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN:   -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN-CL20 %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
 // RUN:   -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-linux-gnu -ffreestanding \
@@ -30,34 +30,62 @@ void fe1a() {
   __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WRKGRP);
 }
 
-// AMDGCN-LABEL: define hidden void @fe1b(
-// AMDGCN-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr
-// AMDGCN-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// AMDGCN-NEXT:      i32 1, label %[[ACQUIRE:.*]]
-// AMDGCN-NEXT:      i32 2, label %[[ACQUIRE]]
-// AMDGCN-NEXT:      i32 3, label %[[RELEASE:.*]]
-// AMDGCN-NEXT:      i32 4, label %[[ACQREL:.*]]
-// AMDGCN-NEXT:      i32 5, label %[[SEQCST:.*]]
-// AMDGCN-NEXT:    ]
-// AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
-// AMDGCN-NEXT:    ret void
-// AMDGCN:       [[ACQUIRE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") acquire
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[RELEASE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[ACQREL]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") acq_rel
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[SEQCST]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") seq_cst
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12-LABEL: define hidden void @fe1b(
+// AMDGCN-CL12-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// AMDGCN-CL12-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-CL12-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-CL12-NEXT:    [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr
+// AMDGCN-CL12-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4
+// AMDGCN-CL12-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4
+// AMDGCN-CL12-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN-CL12-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// AMDGCN-CL12-NEXT:      i32 2, label %[[ACQUIRE]]
+// AMDGCN-CL12-NEXT:      i32 3, label %[[RELEASE:.*]]
+// AMDGCN-CL12-NEXT:      i32 4, label %[[ACQREL:.*]]
+// AMDGCN-CL12-NEXT:      i32 5, label %[[SEQCST:.*]]
+// AMDGCN-CL12-NEXT:    ]
+// AMDGCN-CL12:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN-CL12-NEXT:    ret void
+// AMDGCN-CL12:       [[ACQUIRE]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("workgroup") acquire
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[RELEASE]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("workgroup") release
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[ACQREL]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("workgroup") acq_rel
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[SEQCST]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("workgroup") seq_cst
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// AMDGCN-CL20-LABEL: define hidden void @fe1b(
+// AMDGCN-CL20-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// AMDGCN-CL20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-CL20-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-CL20-NEXT:    store i32 [[ORD]], ptr addrspace(5) [[ORD_ADDR]], align 4
+// AMDGCN-CL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[ORD_ADDR]], align 4
+// AMDGCN-CL20-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN-CL20-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// AMDGCN-CL20-NEXT:      i32 2, label %[[ACQUIRE]]
+// AMDGCN-CL20-NEXT:      i32 3, label %[[RELEASE:.*]]
+// AMDGCN-CL20-NEXT:      i32 4, label %[[ACQREL:.*]]
+// AMDGCN-CL20-NEXT:      i32 5, label %[[SEQCST:.*]]
+// AMDGCN-CL20-NEXT:    ]
+// AMDGCN-CL20:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN-CL20-NEXT:    ret void
+// AMDGCN-CL20:       [[ACQUIRE]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("workgroup") acquire
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[RELEASE]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("workgroup") release
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[ACQREL]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("workgroup") acq_rel
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[SEQCST]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("workgroup") seq_cst
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 //
 // SPIRV-LABEL: define hidden spir_func void @fe1b(
 // SPIRV-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
@@ -119,37 +147,68 @@ void fe1b(int ord) {
   __scoped_atomic_thread_fence(ord, __MEMORY_SCOPE_WRKGRP);
 }
 
-// AMDGCN-LABEL: define hidden void @fe1c(
-// AMDGCN-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr
-// AMDGCN-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// AMDGCN-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
-// AMDGCN-NEXT:    ]
-// AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
-// AMDGCN-NEXT:    ret void
-// AMDGCN:       [[DEVICE_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("agent") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[SYSTEM_SCOPE]]:
-// AMDGCN-NEXT:    fence release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[WORKGROUP_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[WAVEFRONT_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("wavefront") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[SINGLE_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("singlethread") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12-LABEL: define hidden void @fe1c(
+// AMDGCN-CL12-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// AMDGCN-CL12-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-CL12-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-CL12-NEXT:    [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr
+// AMDGCN-CL12-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4
+// AMDGCN-CL12-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4
+// AMDGCN-CL12-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN-CL12-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// AMDGCN-CL12-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// AMDGCN-CL12-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// AMDGCN-CL12-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// AMDGCN-CL12-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// AMDGCN-CL12-NEXT:    ]
+// AMDGCN-CL12:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN-CL12-NEXT:    ret void
+// AMDGCN-CL12:       [[DEVICE_SCOPE]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("agent") release
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[SYSTEM_SCOPE]]:
+// AMDGCN-CL12-NEXT:    fence release
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[WORKGROUP_SCOPE]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("workgroup") release
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[WAVEFRONT_SCOPE]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("wavefront") release
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL12:       [[SINGLE_SCOPE]]:
+// AMDGCN-CL12-NEXT:    fence syncscope("singlethread") release
+// AMDGCN-CL12-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// AMDGCN-CL20-LABEL: define hidden void @fe1c(
+// AMDGCN-CL20-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// AMDGCN-CL20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-CL20-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-CL20-NEXT:    store i32 [[SCOPE]], ptr addrspace(5) [[SCOPE_ADDR]], align 4
+// AMDGCN-CL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SCOPE_ADDR]], align 4
+// AMDGCN-CL20-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN-CL20-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// AMDGCN-CL20-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// AMDGCN-CL20-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// AMDGCN-CL20-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// AMDGCN-CL20-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// AMDGCN-CL20-NEXT:    ]
+// AMDGCN-CL20:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN-CL20-NEXT:    ret void
+// AMDGCN-CL20:       [[DEVICE_SCOPE]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("agent") release
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[SYSTEM_SCOPE]]:
+// AMDGCN-CL20-NEXT:    fence release
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[WORKGROUP_SCOPE]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("workgroup") release
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[WAVEFRONT_SCOPE]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("wavefront") release
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN-CL20:       [[SINGLE_SCOPE]]:
+// AMDGCN-CL20-NEXT:    fence syncscope("singlethread") release
+// AMDGCN-CL20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 //
 // SPIRV-LABEL: define hidden spir_func void @fe1c(
 // SPIRV-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 57d056b0ff9d5..7377b5bcbc347 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -69,11 +69,9 @@ struct LargeStructOneMember g_s;
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
-// AMDGCN20-NEXT:    [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN20-NEXT:    store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
 // AMDGCN20-NEXT:    ret [[STRUCT_MAT4X4]] [[TMP0]]
 //
 // SPIR-LABEL: define dso_local spir_func void @foo(
@@ -152,22 +150,19 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN20-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
 // AMDGCN20-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
-// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
-// AMDGCN20-NEXT:    store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false)
+// AMDGCN20-NEXT:    store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_kernel void @ker(
@@ -250,11 +245,10 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
 // AMDGCN-NEXT:    ret void
 //
 // AMDGCN20-LABEL: define dso_local void @foo_large(
-// AMDGCN20-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN20-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_func void @foo_large(
@@ -325,18 +319,15 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN20-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN20-NEXT:    call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
+// AMDGCN20-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_kernel void @ker_large(
@@ -428,14 +419,12 @@ kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP0]], ptr [[X]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_func void @FuncOneMember(
@@ -508,16 +497,14 @@ void FuncOneMember(struct StructOneMember u) {
 // AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember(
 // AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i64 0, i64 0
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember(
@@ -656,10 +643,7 @@ kernel void test_indirect_arg_local(void) {
 // AMDGCN20-SAME: ) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false)
-// AMDGCN20-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private(
@@ -710,11 +694,10 @@ void test_indirect_arg_private(void) {
 // AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN20-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
 // AMDGCN20-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -777,9 +760,8 @@ kernel void KernelOneMember(struct StructOneMember u) {
 // AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
 // AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
 // AMDGCN20-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
@@ -843,13 +825,10 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
 // AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 800, i1 false)
-// AMDGCN20-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember(
@@ -915,16 +894,14 @@ kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
-// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP2]], ptr [[Y]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_func void @FuncTwoMember(
@@ -1001,16 +978,14 @@ void FuncTwoMember(struct StructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember(
 // AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i64 0, i64 0
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember(
@@ -1082,17 +1057,16 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN20-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
-// AMDGCN20-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
+// AMDGCN20-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN20-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
 // AMDGCN20-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -1164,16 +1138,13 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
 // AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN20-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN20-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 480, i1 false)
-// AMDGCN20-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN20-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember(
diff --git a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
index dba6519966eb5..c847f5850b223 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
@@ -15,9 +15,8 @@
 // CL20-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CL20-NEXT:  [[ENTRY:.*:]]
 // CL20-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CL20-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CL20-NEXT:    store ptr [[X]], ptr [[X_ADDR_ASCAST]], align 8
-// CL20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
+// CL20-NEXT:    store ptr [[X]], ptr addrspace(5) [[X_ADDR]], align 8
+// CL20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[X_ADDR]], align 8
 // CL20-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CL20-NEXT:    ret void
 //
@@ -55,27 +54,25 @@ void func1(int *x) {
 // CL20-NEXT:    [[LP1:%.*]] = alloca ptr, align 8, addrspace(5)
 // CL20-NEXT:    [[LP2:%.*]] = alloca ptr, align 8, addrspace(5)
 // CL20-NEXT:    [[LVC:%.*]] = alloca i32, align 4, addrspace(5)
+// CL20-NEXT:    store i32 1, ptr addrspace(5) [[LV1]], align 4
+// CL20-NEXT:    store i32 2, ptr addrspace(5) [[LV2]], align 4
+// CL20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
+// CL20-NEXT:    store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4
 // CL20-NEXT:    [[LV1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr
-// CL20-NEXT:    [[LV2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV2]] to ptr
-// CL20-NEXT:    [[LA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LA]] to ptr
-// CL20-NEXT:    [[LP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP1]] to ptr
-// CL20-NEXT:    [[LP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP2]] to ptr
-// CL20-NEXT:    [[LVC_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LVC]] to ptr
-// CL20-NEXT:    store i32 1, ptr [[LV1_ASCAST]], align 4
-// CL20-NEXT:    store i32 2, ptr [[LV2_ASCAST]], align 4
-// CL20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr [[LA_ASCAST]], i64 0, i64 0
-// CL20-NEXT:    store i32 3, ptr [[ARRAYIDX]], align 4
-// CL20-NEXT:    store ptr [[LV1_ASCAST]], ptr [[LP1_ASCAST]], align 8
-// CL20-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr [[LA_ASCAST]], i64 0, i64 0
-// CL20-NEXT:    store ptr [[ARRAYDECAY]], ptr [[LP2_ASCAST]], align 8
-// CL20-NEXT:    call void @func1(ptr noundef [[LV1_ASCAST]]) #[[ATTR2:[0-9]+]]
-// CL20-NEXT:    store i32 4, ptr [[LVC_ASCAST]], align 4
-// CL20-NEXT:    store i32 4, ptr [[LV1_ASCAST]], align 4
+// CL20-NEXT:    store ptr [[LV1_ASCAST]], ptr addrspace(5) [[LP1]], align 8
+// CL20-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
+// CL20-NEXT:    [[ARRAYDECAY_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
+// CL20-NEXT:    store ptr [[ARRAYDECAY_ASCAST]], ptr addrspace(5) [[LP2]], align 8
+// CL20-NEXT:    [[LV1_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr
+// CL20-NEXT:    call void @func1(ptr noundef [[LV1_ASCAST1]]) #[[ATTR2:[0-9]+]]
+// CL20-NEXT:    store i32 4, ptr addrspace(5) [[LVC]], align 4
+// CL20-NEXT:    store i32 4, ptr addrspace(5) [[LV1]], align 4
 // CL20-NEXT:    ret void
 //
 void func2(void) {
   int lv1;
   lv1 = 1;
+
   int lv2 = 2;
 
   int la[100];
@@ -102,8 +99,7 @@ void func2(void) {
 // CL20-SAME: ) #[[ATTR0]] {
 // CL20-NEXT:  [[ENTRY:.*:]]
 // CL20-NEXT:    [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5)
-// CL20-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CL20-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[A_ASCAST]], i8 0, i64 64, i1 false)
+// CL20-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false)
 // CL20-NEXT:    ret void
 //
 void func3(void) {
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index 084281a8cada4..a5f682646d338 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefix=AMDGCN %s
 
 typedef int int2 __attribute__((ext_vector_type(2)));
@@ -42,16 +42,14 @@ struct LargeStructOneMember g_s;
 #endif
 
 
-// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo(
-// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo
+// AMDGCN-SAME: ([9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
-// AMDGCN-NEXT:    [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
-// AMDGCN-NEXT:    store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN-NEXT:    store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
 // AMDGCN-NEXT:    ret [[STRUCT_MAT4X4]] [[TMP0]]
 //
 Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
@@ -62,40 +60,36 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // Expect two mem copies: one for the argument "in", and one for
 // the return value.
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
 // AMDGCN-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
-// AMDGCN-NEXT:    store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4
-// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false)
+// AMDGCN-NEXT:    store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
 // AMDGCN-NEXT:    ret void
 //
 kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
   out[0] = foo(in[1]);
 }
 
-// AMDGCN-LABEL: define dso_local void @foo_large(
-// AMDGCN-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN-LABEL: define dso_local void @foo_large
+// AMDGCN-SAME: (ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
 // AMDGCN-NEXT:    ret void
 //
 Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
@@ -103,63 +97,56 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
   return out;
 }
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN-NEXT:    call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
-// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
+// AMDGCN-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
 // AMDGCN-NEXT:    ret void
 //
 kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
   out[0] = foo_large(in[1]);
 }
 
-// AMDGCN-LABEL: define dso_local void @FuncOneMember(
-// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local void @FuncOneMember
+// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP0]], ptr [[X]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
 // AMDGCN-NEXT:    ret void
 //
 void FuncOneMember(struct StructOneMember u) {
   u.x = (int2)(0, 0);
 }
 
-// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember(
-// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember
+// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i64 0, i64 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN-NEXT:    ret void
 //
 void FuncOneLargeMember(struct LargeStructOneMember u) {
@@ -167,9 +154,9 @@ void FuncOneLargeMember(struct LargeStructOneMember u) {
 }
 
 #if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables))
-// AMDGCN-LABEL: define dso_local void @test_indirect_arg_globl(
-// AMDGCN-SAME: ) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local void @test_indirect_arg_globl
+// AMDGCN-SAME: () #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
 // AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
@@ -180,9 +167,9 @@ void test_indirect_arg_globl(void) {
 }
 #endif
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
-// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local
+// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
 // AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
@@ -193,14 +180,11 @@ kernel void test_indirect_arg_local(void) {
   FuncOneLargeMember(l_s);
 }
 
-// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private(
-// AMDGCN-SAME: ) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private
+// AMDGCN-SAME: () #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false)
-// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 void test_indirect_arg_private(void) {
@@ -208,15 +192,14 @@ void test_indirect_arg_private(void) {
   FuncOneLargeMember(p_s);
 }
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
-// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember
+// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
 // AMDGCN-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
@@ -224,13 +207,12 @@ kernel void KernelOneMember(struct StructOneMember u) {
   FuncOneMember(u);
 }
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
 // AMDGCN-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
@@ -240,78 +222,70 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
   FuncOneMember(*u);
 }
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 800, i1 false)
-// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
   FuncOneLargeMember(u);
 }
 
-// AMDGCN-LABEL: define dso_local void @FuncTwoMember(
-// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local void @FuncTwoMember
+// AMDGCN-SAME: (<2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
-// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP2]], ptr [[Y]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
 // AMDGCN-NEXT:    ret void
 //
 void FuncTwoMember(struct StructTwoMember u) {
   u.y = (int2)(0, 0);
 }
 
-// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember(
-// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember
+// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i64 0, i64 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN-NEXT:    ret void
 //
 void FuncLargeTwoMember(struct LargeStructTwoMember u) {
   u.y[0] = (int2)(0, 0);
 }
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember
+// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
+// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
 // AMDGCN-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
@@ -319,39 +293,19 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
   FuncTwoMember(u);
 }
 
-// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 480, i1 false)
-// AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
   FuncLargeTwoMember(u);
 }
-//.
-// AMDGCN: [[META4]] = !{i32 1, i32 1}
-// AMDGCN: [[META5]] = !{!"none", !"none"}
-// AMDGCN: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
-// AMDGCN: [[META7]] = !{!"", !""}
-// AMDGCN: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
-// AMDGCN: [[META9]] = !{}
-// AMDGCN: [[META10]] = !{i32 0}
-// AMDGCN: [[META11]] = !{!"none"}
-// AMDGCN: [[META12]] = !{!"struct StructOneMember"}
-// AMDGCN: [[META13]] = !{!""}
-// AMDGCN: [[META14]] = !{i32 1}
-// AMDGCN: [[META15]] = !{!"struct StructOneMember*"}
-// AMDGCN: [[META16]] = !{!"struct LargeStructOneMember"}
-// AMDGCN: [[META17]] = !{!"struct StructTwoMember"}
-// AMDGCN: [[META18]] = !{!"struct LargeStructTwoMember"}
-//.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index ace34dd0ca6dc..41bbc484b315b 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -70,13 +70,11 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  entry:
 // NOCPU-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
 // NOCPU-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// NOCPU-NEXT:    [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr
-// NOCPU-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// NOCPU-NEXT:    store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
 // NOCPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
 // NOCPU-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8
 // NOCPU-NEXT:    ret void
@@ -103,108 +101,96 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
 // NOCPU-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// NOCPU-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// NOCPU-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// NOCPU-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// NOCPU-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// NOCPU-NEXT:    [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr
-// NOCPU-NEXT:    [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr
-// NOCPU-NEXT:    [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr
-// NOCPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// NOCPU-NEXT:    [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// NOCPU-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP2]] to ptr
-// NOCPU-NEXT:    [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// NOCPU-NEXT:    [[TMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP11]] to ptr
-// NOCPU-NEXT:    [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
-// NOCPU-NEXT:    [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr
-// NOCPU-NEXT:    [[BLOCK20_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK20]] to ptr
+// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
+// NOCPU-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
+// NOCPU-NEXT:    [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
+// NOCPU-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
+// NOCPU-NEXT:    [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
+// NOCPU-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
+// NOCPU-NEXT:    [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
+// NOCPU-NEXT:    [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
+// NOCPU-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
+// NOCPU-NEXT:    [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
+// NOCPU-NEXT:    [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8
+// NOCPU-NEXT:    [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
+// NOCPU-NEXT:    [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// NOCPU-NEXT:    store i64 100, ptr addrspace(5) [[TMP21]], align 8
+// NOCPU-NEXT:    [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
+// NOCPU-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8
 // NOCPU-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// NOCPU-NEXT:    [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr
-// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i32 0, ptr [[FLAGS_ASCAST]], align 4
-// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
-// NOCPU-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT:    store i32 25, ptr [[BLOCK_SIZE]], align 8
-// NOCPU-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1
-// NOCPU-NEXT:    store i32 8, ptr [[BLOCK_ALIGN]], align 4
-// NOCPU-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 2
-// NOCPU-NEXT:    store ptr @__test_block_invoke, ptr [[BLOCK_INVOKE]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8
-// NOCPU-NEXT:    [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
-// NOCPU-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
-// NOCPU-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT:    store i32 41, ptr [[BLOCK_SIZE4]], align 8
-// NOCPU-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1
-// NOCPU-NEXT:    store i32 8, ptr [[BLOCK_ALIGN5]], align 4
-// NOCPU-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 2
-// NOCPU-NEXT:    store ptr @__test_block_invoke_2, ptr [[BLOCK_INVOKE6]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 6
-// NOCPU-NEXT:    [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
-// NOCPU-NEXT:    [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8
-// NOCPU-NEXT:    [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
-// NOCPU-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
-// NOCPU-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT:    store i32 41, ptr [[BLOCK_SIZE13]], align 8
-// NOCPU-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1
-// NOCPU-NEXT:    store i32 8, ptr [[BLOCK_ALIGN14]], align 4
-// NOCPU-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 2
-// NOCPU-NEXT:    store ptr @__test_block_invoke_3, ptr [[BLOCK_INVOKE15]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 6
-// NOCPU-NEXT:    [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
-// NOCPU-NEXT:    [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8
-// NOCPU-NEXT:    [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT:    store i64 100, ptr [[TMP18]], align 8
-// NOCPU-NEXT:    [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
-// NOCPU-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT:    store i32 32, ptr [[BLOCK_SIZE22]], align 8
-// NOCPU-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
-// NOCPU-NEXT:    store i32 8, ptr [[BLOCK_ALIGN23]], align 4
-// NOCPU-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 2
-// NOCPU-NEXT:    store ptr @__test_block_invoke_4, ptr [[BLOCK_INVOKE24]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8
-// NOCPU-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8
-// NOCPU-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr [[BLOCK20_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP22:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
-// NOCPU-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
+// NOCPU-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8
+// NOCPU-NEXT:    [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8
+// NOCPU-NEXT:    [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// NOCPU-NEXT:    [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
 // NOCPU-NEXT:    ret void
 //
 //
@@ -214,10 +200,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  entry:
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -244,10 +228,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  entry:
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -281,12 +263,9 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// NOCPU-NEXT:    [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr
-// NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -299,7 +278,7 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
 // NOCPU-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
 // NOCPU-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
-// NOCPU-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4
+// NOCPU-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4
 // NOCPU-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
 // NOCPU-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4
 // NOCPU-NEXT:    ret void
@@ -322,10 +301,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  entry:
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
@@ -354,18 +331,13 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
 // NOCPU-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
 // NOCPU-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// NOCPU-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// NOCPU-NEXT:    [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr
-// NOCPU-NEXT:    [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr
-// NOCPU-NEXT:    [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr
-// NOCPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i32 0, ptr [[FLAGS_ASCAST]], align 4
+// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
+// NOCPU-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
 // NOCPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
-// NOCPU-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
 // NOCPU-NEXT:    ret void
 //
 //
@@ -375,10 +347,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  entry:
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
 // NOCPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
 // NOCPU-NEXT:    ret void
 //
@@ -413,13 +383,11 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
 // GFX900-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// GFX900-NEXT:    [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr
-// GFX900-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// GFX900-NEXT:    store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[TBAA3:![0-9]+]]
-// GFX900-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[TBAA3:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8, !tbaa [[TBAA7:![0-9]+]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
 // GFX900-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    ret void
@@ -446,114 +414,102 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
 // GFX900-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
 // GFX900-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// GFX900-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// GFX900-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// GFX900-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// GFX900-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// GFX900-NEXT:    [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr
-// GFX900-NEXT:    [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr
-// GFX900-NEXT:    [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr
-// GFX900-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// GFX900-NEXT:    [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// GFX900-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP2]] to ptr
-// GFX900-NEXT:    [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// GFX900-NEXT:    [[TMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP11]] to ptr
-// GFX900-NEXT:    [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
-// GFX900-NEXT:    [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr
-// GFX900-NEXT:    [[BLOCK20_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK20]] to ptr
-// GFX900-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// GFX900-NEXT:    [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14:![0-9]+]]
-// GFX900-NEXT:    store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]]
-// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA14:![0-9]+]]
+// GFX900-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA16:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8:[0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
-// GFX900-NEXT:    store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17:![0-9]+]]
+// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17:![0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
-// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19:![0-9]+]]
-// GFX900-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]]
-// GFX900-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0
-// GFX900-NEXT:    store i32 25, ptr [[BLOCK_SIZE]], align 8
-// GFX900-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1
-// GFX900-NEXT:    store i32 8, ptr [[BLOCK_ALIGN]], align 4
-// GFX900-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 2
-// GFX900-NEXT:    store ptr @__test_block_invoke, ptr [[BLOCK_INVOKE]], align 8
-// GFX900-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]]
-// GFX900-NEXT:    store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8, !tbaa [[TBAA14]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]]
-// GFX900-NEXT:    store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT:    [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
-// GFX900-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
-// GFX900-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0
-// GFX900-NEXT:    store i32 41, ptr [[BLOCK_SIZE4]], align 8
-// GFX900-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1
-// GFX900-NEXT:    store i32 8, ptr [[BLOCK_ALIGN5]], align 4
-// GFX900-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 2
-// GFX900-NEXT:    store ptr @__test_block_invoke_2, ptr [[BLOCK_INVOKE6]], align 8
-// GFX900-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]]
-// GFX900-NEXT:    store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8, !tbaa [[TBAA14]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 6
-// GFX900-NEXT:    [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]]
-// GFX900-NEXT:    store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
-// GFX900-NEXT:    [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
-// GFX900-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
-// GFX900-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0
-// GFX900-NEXT:    store i32 41, ptr [[BLOCK_SIZE13]], align 8
-// GFX900-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1
-// GFX900-NEXT:    store i32 8, ptr [[BLOCK_ALIGN14]], align 4
-// GFX900-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 2
-// GFX900-NEXT:    store ptr @__test_block_invoke_3, ptr [[BLOCK_INVOKE15]], align 8
-// GFX900-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]]
-// GFX900-NEXT:    store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8, !tbaa [[TBAA14]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 6
-// GFX900-NEXT:    [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]]
-// GFX900-NEXT:    store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
-// GFX900-NEXT:    [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19:![0-9]+]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]]
+// GFX900-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
+// GFX900-NEXT:    store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA14]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8, !tbaa [[TBAA14]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA16]]
+// GFX900-NEXT:    store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
+// GFX900-NEXT:    [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
+// GFX900-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
+// GFX900-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
+// GFX900-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA14]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8, !tbaa [[TBAA14]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
+// GFX900-NEXT:    [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA16]]
+// GFX900-NEXT:    store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
+// GFX900-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
+// GFX900-NEXT:    [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
+// GFX900-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
+// GFX900-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
+// GFX900-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA14]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8, !tbaa [[TBAA14]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
+// GFX900-NEXT:    [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA16]]
+// GFX900-NEXT:    store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
+// GFX900-NEXT:    [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
-// GFX900-NEXT:    [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// GFX900-NEXT:    store i64 100, ptr [[TMP18]], align 8
-// GFX900-NEXT:    [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// GFX900-NEXT:    [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// GFX900-NEXT:    store i64 100, ptr addrspace(5) [[TMP21]], align 8
+// GFX900-NEXT:    [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
-// GFX900-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
-// GFX900-NEXT:    store i32 32, ptr [[BLOCK_SIZE22]], align 8
-// GFX900-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
-// GFX900-NEXT:    store i32 8, ptr [[BLOCK_ALIGN23]], align 4
-// GFX900-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 2
-// GFX900-NEXT:    store ptr @__test_block_invoke_4, ptr [[BLOCK_INVOKE24]], align 8
-// GFX900-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT:    [[TMP22:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
-// GFX900-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT:    [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
+// GFX900-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
+// GFX900-NEXT:    store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// GFX900-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
+// GFX900-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// GFX900-NEXT:    [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
@@ -566,8 +522,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
 // GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -593,8 +548,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -627,10 +581,8 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // GFX900-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// GFX900-NEXT:    store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA26:![0-9]+]]
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[TBAA26:![0-9]+]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -643,7 +595,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]]
 // GFX900-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
 // GFX900-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA26]]
+// GFX900-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[TBAA26]]
 // GFX900-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
 // GFX900-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]]
 // GFX900-NEXT:    ret void
@@ -665,8 +617,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
 // GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
@@ -695,21 +646,16 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
 // GFX900-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
 // GFX900-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// GFX900-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// GFX900-NEXT:    [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr
-// GFX900-NEXT:    [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr
-// GFX900-NEXT:    [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr
-// GFX900-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA33:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[TBAA33:![0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
-// GFX900-NEXT:    store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
 // GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
-// GFX900-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
@@ -721,8 +667,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
 // GFX900-NEXT:  entry:
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
 // GFX900-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index a0c106bca83c9..7012c5a24ca3d 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -139,12 +139,12 @@ void test_static_var_local(void) {
 
 // Test function-scope variable initialization.
 // NOOPT-LABEL: @test_func_scope_var_private(
-// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %sp1{{.*}}, align 4
-// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %sp2{{.*}}, align 4
-// NOOPT: store ptr addrspace(5) null, ptr %sp3{{.*}}, align 4
-// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %sp4{{.*}}, align 4
-// NOOPT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
-// NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
+// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1, align 4
+// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2, align 4
+// NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3, align 4
+// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4, align 4
+// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
+// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2, i8 0, i64 24, i1 false)
 void test_func_scope_var_private(void) {
   private char *sp1 = 0;
   private char *sp2 = NULL;
@@ -157,12 +157,12 @@ void test_func_scope_var_private(void) {
 
 // Test function-scope variable initialization.
 // NOOPT-LABEL: @test_func_scope_var_local(
-// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %sp1{{.*}}, align 4
-// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %sp2{{.*}}, align 4
-// NOOPT: store ptr addrspace(3) null, ptr %sp3{{.*}}, align 4
-// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %sp4{{.*}}, align 4
-// NOOPT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
-// NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
+// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1, align 4
+// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2, align 4
+// NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3, align 4
+// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4, align 4
+// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
+// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2, i8 0, i64 24, i1 false)
 void test_func_scope_var_local(void) {
   local char *sp1 = 0;
   local char *sp2 = NULL;
@@ -603,7 +603,7 @@ int test_and_ptr(private char* p1, local char* p2) {
 // Test folding of null pointer in function scope.
 // NOOPT-LABEL: test_fold_private
 // NOOPT: call void @test_fold_callee
-// NOOPT: store ptr addrspace(1) null, ptr %glob{{.*}}, align 8
+// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob, align 8
 // NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
 // NOOPT: call void @test_fold_callee
 // NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
@@ -619,7 +619,7 @@ void test_fold_private(void) {
 
 // NOOPT-LABEL: test_fold_local
 // NOOPT: call void @test_fold_callee
-// NOOPT: store ptr addrspace(1) null, ptr %glob{{.*}}, align 8
+// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob, align 8
 // NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
 // NOOPT: call void @test_fold_callee
 // NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
diff --git a/clang/test/CodeGenOpenCL/atomic-ops.cl b/clang/test/CodeGenOpenCL/atomic-ops.cl
index 1d850261e5e81..9c1775cc04303 100644
--- a/clang/test/CodeGenOpenCL/atomic-ops.cl
+++ b/clang/test/CodeGenOpenCL/atomic-ops.cl
@@ -344,14 +344,11 @@ int test_volatile(volatile atomic_int *i) {
   // CHECK-LABEL: @test_volatile
   // CHECK:      %[[i_addr:.*]] = alloca ptr
   // CHECK-NEXT: %[[atomicdst:.*]] = alloca i32
-  // CHECK-NEXT: %[[retval_ascast:.*]] = addrspacecast ptr addrspace(5) {{.*}} to ptr
-  // CHECK-NEXT: %[[i_addr_ascast:.*]] = addrspacecast ptr addrspace(5) %[[i_addr]] to ptr
-  // CHECK-NEXT: %[[atomicdst_ascast:.*]] = addrspacecast ptr addrspace(5) %[[atomicdst]] to ptr
-  // CHECK-NEXT: store ptr %i, ptr %[[i_addr_ascast]]
-  // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr %[[i_addr_ascast]]
+  // CHECK-NEXT: store ptr %i, ptr addrspace(5) %[[i_addr]]
+  // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr addrspace(5) %[[i_addr]]
   // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4{{$}}
-  // CHECK-NEXT: store i32 %[[res]], ptr %[[atomicdst_ascast]]
-  // CHECK-NEXT: %[[retval:.*]] = load i32, ptr %[[atomicdst_ascast]]
+  // CHECK-NEXT: store i32 %[[res]], ptr addrspace(5) %[[atomicdst]]
+  // CHECK-NEXT: %[[retval:.*]] = load i32, ptr addrspace(5) %[[atomicdst]]
   // CHECK-NEXT: ret i32 %[[retval]]
   return __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group);
 }
diff --git a/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl
index 0dafb44f12a3c..7d684bc185a58 100644
--- a/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl
@@ -31,9 +31,9 @@ typedef enum memory_scope {
 // GFX90A-HW-REMARK: Hardware instruction generated for atomic fadd operation at memory scope agent-one-as due to an unsafe request. [-Rpass=si-lower]
 // GFX90A-HW-REMARK: Hardware instruction generated for atomic fadd operation at memory scope workgroup-one-as due to an unsafe request. [-Rpass=si-lower]
 
-// GFX90A-HW-REMARK: global_atomic_add_f32 v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc
-// GFX90A-HW-REMARK: global_atomic_add_f32 v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc
-// GFX90A-HW-REMARK: global_atomic_add_f32 v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc
+// GFX90A-HW-REMARK: global_atomic_add_f32 v0, v[0:1], v2, off glc
+// GFX90A-HW-REMARK: global_atomic_add_f32 v0, v[0:1], v2, off glc
+// GFX90A-HW-REMARK: global_atomic_add_f32 v0, v[0:1], v2, off glc
 // GFX90A-HW-LABEL: @atomic_unsafe_hw
 // GFX90A-HW:   atomicrmw fadd ptr addrspace(1) %{{.*}}, float %{{.*}} syncscope("workgroup-one-as") monotonic, align 4
 // GFX90A-HW:   atomicrmw fadd ptr addrspace(1) %{{.*}}, float %{{.*}} syncscope("agent-one-as") monotonic, align 4
diff --git a/clang/test/CodeGenOpenCL/blocks.cl b/clang/test/CodeGenOpenCL/blocks.cl
index 161f1406c96cb..e04722f657cfa 100644
--- a/clang/test/CodeGenOpenCL/blocks.cl
+++ b/clang/test/CodeGenOpenCL/blocks.cl
@@ -25,13 +25,13 @@ void foo(){
   // COMMON-NOT: %block.reserved
   // COMMON-NOT: %block.descriptor
   // SPIR: %[[block_size:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), i32 }>, ptr %block, i32 0, i32 0
-  // AMDGCN: %[[block_size:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr %block{{.*}}, i32 0, i32 0
+  // AMDGCN: %[[block_size:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr addrspace(5) %block, i32 0, i32 0
   // SPIR: store i32 16, ptr %[[block_size]]
-  // AMDGCN: store i32 20, ptr %[[block_size]]
+  // AMDGCN: store i32 20, ptr addrspace(5) %[[block_size]]
   // SPIR: %[[block_align:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), i32 }>, ptr %block, i32 0, i32 1
-  // AMDGCN: %[[block_align:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr %block{{.*}}, i32 0, i32 1
+  // AMDGCN: %[[block_align:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr addrspace(5) %block, i32 0, i32 1
   // SPIR: store i32 4, ptr %[[block_align]]
-  // AMDGCN: store i32 8, ptr %[[block_align]]
+  // AMDGCN: store i32 8, ptr addrspace(5) %[[block_align]]
   // SPIR: %[[block_invoke:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), i32 }>, ptr %[[block:.*]], i32 0, i32 2
   // SPIR: store ptr addrspace(4) addrspacecast (ptr @__foo_block_invoke to ptr addrspace(4)), ptr %[[block_invoke]]
   // SPIR: %[[block_captured:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), i32 }>, ptr %[[block]], i32 0, i32 3
@@ -41,13 +41,14 @@ void foo(){
   // SPIR: store ptr addrspace(4) %[[blk_gen_ptr]], ptr %[[block_B:.*]],
   // SPIR: %[[block_literal:.*]] = load ptr addrspace(4), ptr %[[block_B]]
   // SPIR: call {{.*}}i32 @__foo_block_invoke(ptr addrspace(4) noundef %[[block_literal]])
-  // AMDGCN: %[[block_invoke:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr %[[block:.*]], i32 0, i32 2
-  // AMDGCN: store ptr @__foo_block_invoke, ptr %[[block_invoke]]
-  // AMDGCN: %[[block_captured:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr %[[block]], i32 0, i32 3
-  // AMDGCN: %[[i_value:.*]] = load i32, ptr %i
-  // AMDGCN: store i32 %[[i_value]], ptr %[[block_captured]],
-  // AMDGCN: store ptr %[[block]], ptr %[[block_B:.*]],
-  // AMDGCN: %[[block_literal:.*]] = load ptr, ptr %[[block_B]]
+  // AMDGCN: %[[block_invoke:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr addrspace(5) %[[block:.*]], i32 0, i32 2
+  // AMDGCN: store ptr @__foo_block_invoke, ptr addrspace(5) %[[block_invoke]]
+  // AMDGCN: %[[block_captured:.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i32 }>, ptr addrspace(5) %[[block]], i32 0, i32 3
+  // AMDGCN: %[[i_value:.*]] = load i32, ptr addrspace(5) %i
+  // AMDGCN: store i32 %[[i_value]], ptr addrspace(5) %[[block_captured]],
+  // AMDGCN: %[[blk_gen_ptr:.*]] = addrspacecast ptr addrspace(5) %[[block]] to ptr
+  // AMDGCN: store ptr %[[blk_gen_ptr]], ptr addrspace(5) %[[block_B:.*]],
+  // AMDGCN: %[[block_literal:.*]] = load ptr, ptr addrspace(5) %[[block_B]]
   // AMDGCN: call {{.*}}i32 @__foo_block_invoke(ptr noundef %[[block_literal]])
 
   int (^ block_B)(void) = ^{
diff --git a/clang/test/CodeGenOpenCL/builtins-alloca.cl b/clang/test/CodeGenOpenCL/builtins-alloca.cl
index 85b449e45a0f1..474e95e74e006 100644
--- a/clang/test/CodeGenOpenCL/builtins-alloca.cl
+++ b/clang/test/CodeGenOpenCL/builtins-alloca.cl
@@ -1,12 +1,12 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
-// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL12 %s
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
 // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 \
-// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL20 %s
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
 // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 \
-// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL30 %s
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
 // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -cl-ext=+__opencl_c_generic_address_space \
-// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL30GAS %s
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
 
 // OPENCL-LABEL: define dso_local void @test1_builtin_alloca(
 // OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -20,61 +20,6 @@
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test1_builtin_alloca(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test1_builtin_alloca(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test1_builtin_alloca(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test1_builtin_alloca(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test1_builtin_alloca(unsigned n) {
     __private float* alloc_ptr = (__private float*)__builtin_alloca(n*sizeof(int));
@@ -92,61 +37,6 @@ void test1_builtin_alloca(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_UNINITIALIZED_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_UNINITIALIZED_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test1_builtin_alloca_uninitialized(unsigned n) {
     __private float* alloc_ptr_uninitialized = (__private float*)__builtin_alloca_uninitialized(n*sizeof(int));
@@ -164,61 +54,6 @@ void test1_builtin_alloca_uninitialized(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test1_builtin_alloca_with_align(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test1_builtin_alloca_with_align(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test1_builtin_alloca_with_align(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test1_builtin_alloca_with_align(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test1_builtin_alloca_with_align(unsigned n) {
     __private float* alloc_ptr_align = (__private float*)__builtin_alloca_with_align((n*sizeof(int)), 8);
@@ -236,61 +71,6 @@ void test1_builtin_alloca_with_align(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
     __private float* alloc_ptr_align_uninitialized = (__private float*)__builtin_alloca_with_align_uninitialized((n*sizeof(int)), 8);
@@ -307,57 +87,6 @@ void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test2_builtin_alloca(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test2_builtin_alloca(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test2_builtin_alloca(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test2_builtin_alloca(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test2_builtin_alloca(unsigned n) {
     __private void *alloc_ptr = __builtin_alloca(n);
@@ -374,57 +103,6 @@ void test2_builtin_alloca(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_UNINITIALIZED_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_UNINITIALIZED_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test2_builtin_alloca_uninitialized(unsigned n) {
     __private void *alloc_ptr_uninitialized = __builtin_alloca_uninitialized(n);
@@ -441,57 +119,6 @@ void test2_builtin_alloca_uninitialized(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test2_builtin_alloca_with_align(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test2_builtin_alloca_with_align(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test2_builtin_alloca_with_align(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test2_builtin_alloca_with_align(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test2_builtin_alloca_with_align(unsigned n) {
     __private void *alloc_ptr_align = __builtin_alloca_with_align(n, 8);
@@ -508,57 +135,6 @@ void test2_builtin_alloca_with_align(unsigned n) {
 // OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
 // OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
 // OPENCL-NEXT:    ret void
-// OPENCL12-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
-// OPENCL12-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL12-NEXT:  [[ENTRY:.*:]]
-// OPENCL12-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL12-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL12-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL12-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL12-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL12-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
-// OPENCL12-NEXT:    ret void
-//
-// OPENCL20-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
-// OPENCL20-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL20-NEXT:  [[ENTRY:.*:]]
-// OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST]], align 4
-// OPENCL20-NEXT:    ret void
-//
-// OPENCL30-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
-// OPENCL30-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30-NEXT:  [[ENTRY:.*:]]
-// OPENCL30-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
-// OPENCL30-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL30-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
-// OPENCL30-NEXT:    ret void
-//
-// OPENCL30GAS-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
-// OPENCL30GAS-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
-// OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
-// OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-// OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
-// OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr [[ALLOC_PTR_ALIGN_UNINITIALIZED_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    ret void
 //
 void test2_builtin_alloca_with_align_uninitialized(unsigned n) {
     __private void *alloc_ptr_align_uninitialized = __builtin_alloca_with_align_uninitialized(n, 8);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 5b5ae419f0a4a..cfed96a8d9871 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -8,9 +8,8 @@ typedef unsigned int uint;
 // CHECK-LABEL: @test_s_sleep_var(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[TMP0]])
 // CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 15)
 // CHECK-NEXT:    ret void
@@ -27,19 +26,15 @@ void test_s_sleep_var(int d)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -53,19 +48,15 @@ void test_permlane16_var(global uint* out, uint a, uint b, uint c) {
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlanex16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -89,13 +80,11 @@ void test_s_barrier_signal()
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
 // CHECK-NEXT:    ret void
 //
@@ -109,21 +98,18 @@ void test_s_barrier_signal_var(void *bar, int a)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
 // CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 // CHECK:       if.then:
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[TMP1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
 // CHECK-NEXT:    br label [[IF_END:%.*]]
 // CHECK:       if.else:
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
 // CHECK-NEXT:    br label [[IF_END]]
 // CHECK:       if.end:
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
@@ -143,13 +129,11 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
 // CHECK-NEXT:    ret void
 //
@@ -161,9 +145,8 @@ void test_s_barrier_init(void *bar, int a)
 // CHECK-LABEL: @test_s_barrier_join(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
 // CHECK-NEXT:    ret void
@@ -185,17 +168,13 @@ void test_s_barrier_leave()
 
 // CHECK-LABEL: @test_s_get_barrier_state(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[STATE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STATE]] to ptr
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
-// CHECK-NEXT:    store i32 [[TMP1]], ptr [[STATE_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[STATE_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
 // CHECK-NEXT:    ret i32 [[TMP2]]
 //
 unsigned test_s_get_barrier_state(int a)
@@ -206,18 +185,14 @@ unsigned test_s_get_barrier_state(int a)
 
 // CHECK-LABEL: @test_s_get_named_barrier_state(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    [[STATE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STATE]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
-// CHECK-NEXT:    store i32 [[TMP2]], ptr [[STATE_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STATE_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
 // CHECK-NEXT:    ret i32 [[TMP3]]
 //
 unsigned test_s_get_named_barrier_state(void *bar)
@@ -252,20 +227,16 @@ void test_s_ttracedata_imm()
 // CHECK-NEXT:    [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
 // CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[FP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FP_ADDR]] to ptr
-// CHECK-NEXT:    [[GP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GP_ADDR]] to ptr
-// CHECK-NEXT:    [[CP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CP_ADDR]] to ptr
-// CHECK-NEXT:    [[LEN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LEN_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[FP:%.*]], ptr [[FP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[GP:%.*]], ptr [[GP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(4) [[CP:%.*]], ptr [[CP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr [[LEN_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[FP_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
 // CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[LEN_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr [[CP_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8
 // CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
 // CHECK-NEXT:    ret void
 //
@@ -280,14 +251,12 @@ void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
 // CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
-// CHECK-NEXT:    [[LEN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LEN_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr [[LEN_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[LEN_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
 // CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
index a2f14c652c828..fc5649d8a41f7 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
@@ -10,12 +10,10 @@ typedef unsigned char u8;
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
@@ -27,12 +25,10 @@ void test_global_load_lds_u32(global u32* src, local u32 *dst) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
@@ -44,12 +40,10 @@ void test_global_load_lds_u16(global u16* src, local u16 *dst) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 461abc3708128..e8b6eb57c38d7 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -71,7 +71,7 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 
 // GFX12-LABEL:  test_global_add_half2
-// GFX12:  global_atomic_pk_add_f16 v2, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off th:TH_ATOMIC_RETURN
+// GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
 void test_global_add_half2(__global half2 *addr, half2 x) {
   half2 *rtn;
   *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
@@ -93,7 +93,7 @@ void test_global_add_half2_noret(__global half2 *addr, half2 x) {
 
 
 // GFX12-LABEL:  test_global_add_2bf16
-// GFX12: global_atomic_pk_add_bf16 v2, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off th:TH_ATOMIC_RETURN
+// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
 void test_global_add_2bf16(__global short2 *addr, short2 x) {
   short2 *rtn;
   *rtn = __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
index 2728490d5c02e..2f00977ec6014 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -8,7 +8,7 @@
 // CHECK-LABEL: test_fadd_local
 // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 // GFX8-LABEL: test_fadd_local$local:
-// GFX8: ds_add_rtn_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+// GFX8: ds_add_rtn_f32 v2, v0, v1
 // GFX8: s_endpgm
 kernel void test_fadd_local(__local float *ptr, float val){
     float *res;
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index ef97d12afab1d..556e553903d1a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -20,7 +20,7 @@ void test_global_add_f64(__global double *addr, double x) {
 // CHECK-LABEL: test_global_add_half2
 // CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 // GFX90A-LABEL:  test_global_add_half2
-// GFX90A:  global_atomic_pk_add_f16 v2, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc
+// GFX90A:  global_atomic_pk_add_f16 v2, v[0:1], v2, off glc
 void test_global_add_half2(__global half2 *addr, half2 x) {
   half2 *rtn;
   *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
diff --git a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
index b1e45e6d6e6dc..625f0660482a9 100644
--- a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
+++ b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -18,7 +18,7 @@ kernel void test(int i) {
 // SPIR64: %block_sizes = alloca [1 x i64]
 // COMMON-LABEL: if.then:
 // COMMON-NOT: alloca
-// CHECK-DEBUG: getelementptr {{.*}} %block_sizes{{.*}}, {{.*}} !dbg ![[TEMPLOCATION:[0-9]+]]
+// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg ![[TEMPLOCATION:[0-9]+]]
 // COMMON-LABEL: if.end
   queue_t default_queue;
   unsigned flags = 0;
diff --git a/clang/test/CodeGenOpenCL/opencl_types.cl b/clang/test/CodeGenOpenCL/opencl_types.cl
index aac3492b7a9e8..5743c6e12d61e 100644
--- a/clang/test/CodeGenOpenCL/opencl_types.cl
+++ b/clang/test/CodeGenOpenCL/opencl_types.cl
@@ -42,7 +42,7 @@ kernel void foo(image1d_t img) {
   // CHECK-AMDGCN: alloca ptr addrspace(4)
   event_t evt;
   // CHECK-SPIR: alloca target("spirv.Event")
-  // CHECK-AMDGCN: alloca ptr
+  // CHECK-AMDGCN: alloca ptr addrspace(5)
   clk_event_t clk_evt;
   // CHECK-SPIR: alloca target("spirv.DeviceEvent")
   // CHECK-AMDGCN: alloca ptr addrspace(1)
diff --git a/clang/test/Index/pipe-size.cl b/clang/test/Index/pipe-size.cl
index a48857baef1a6..08b936f1a9b07 100644
--- a/clang/test/Index/pipe-size.cl
+++ b/clang/test/Index/pipe-size.cl
@@ -11,6 +11,6 @@ __kernel void testPipe( pipe int test )
     // SPIR: store i32 4, ptr %s, align 4
     // SPIR64: store target("spirv.Pipe", 0) %test, ptr %test.addr, align 8
     // SPIR64: store i32 8, ptr %s, align 4
-    // AMDGCN: store ptr addrspace(1) %test, ptr %test{{.*}}, align 8
-    // AMDGCN: store i32 8, ptr %s{{.*}}, align 4
+    // AMDGCN: store ptr addrspace(1) %test, ptr addrspace(5) %test.addr, align 8
+    // AMDGCN: store i32 8, ptr addrspace(5) %s, align 4
 }

From 83081919e42376eb3cdbb8267daaf8da0691f9ae Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Fri, 21 Feb 2025 21:19:56 +0100
Subject: [PATCH 201/282] [Support] Ensure complete type DelimitedScope
 (#127459)

`JSONScopedPrinter` has a `std::unique_ptr<DelimitedScope>` member and
defaulted constructor argument, so it needs a complete type. This
resolves one of the many build errors with C++23 using Clang.

(cherry picked from commit e65d3882af6fcc15342451ad4f9494b1ba6b9b9d)
---
 llvm/include/llvm/Support/ScopedPrinter.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Support/ScopedPrinter.h b/llvm/include/llvm/Support/ScopedPrinter.h
index 419ab97366796..506b40a09ed78 100644
--- a/llvm/include/llvm/Support/ScopedPrinter.h
+++ b/llvm/include/llvm/Support/ScopedPrinter.h
@@ -539,7 +539,13 @@ ScopedPrinter::printHex<support::ulittle16_t>(StringRef Label,
   startLine() << Label << ": " << hex(Value) << "\n";
 }
 
-struct DelimitedScope;
+struct DelimitedScope {
+  DelimitedScope(ScopedPrinter &W) : W(&W) {}
+  DelimitedScope() : W(nullptr) {}
+  virtual ~DelimitedScope() = default;
+  virtual void setPrinter(ScopedPrinter &W) = 0;
+  ScopedPrinter *W;
+};
 
 class JSONScopedPrinter : public ScopedPrinter {
 private:
@@ -838,14 +844,6 @@ class JSONScopedPrinter : public ScopedPrinter {
   }
 };
 
-struct DelimitedScope {
-  DelimitedScope(ScopedPrinter &W) : W(&W) {}
-  DelimitedScope() : W(nullptr) {}
-  virtual ~DelimitedScope() = default;
-  virtual void setPrinter(ScopedPrinter &W) = 0;
-  ScopedPrinter *W;
-};
-
 struct DictScope : DelimitedScope {
   explicit DictScope() = default;
   explicit DictScope(ScopedPrinter &W) : DelimitedScope(W) { W.objectBegin(); }

From c02bd85194d3cb6416baaf3dd607a056f2047720 Mon Sep 17 00:00:00 2001
From: Alexandre Ganea <alex_toresh@yahoo.fr>
Date: Mon, 24 Feb 2025 08:53:44 -0500
Subject: [PATCH 202/282] On Windows, remove the UCRT libraries from the
 release script (#128378)

Since the 19.0 release, the UCRT dlls shouldn't be included anymore in the Windows distribution, as we link the CRT
statically into all distributed binaries.

https://discourse.llvm.org/t/llvm-x86-64-pc-windows-msvc-binaries-no-longer-need-msvc-runtime-dlls-since-19-x/84465
(cherry picked from commit 4ba3ebef642eaa4d46567a972c5ee1badb2ebadf)
---
 llvm/utils/release/build_llvm_release.bat | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 1c30673cf88bd..588d7201fcb92 100755
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -152,7 +152,6 @@ set common_cmake_flags=^
   -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^
   -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86;BPF;WebAssembly;RISCV;NVPTX" ^
   -DLLVM_BUILD_LLVM_C_DYLIB=ON ^
-  -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^
   -DPython3_FIND_REGISTRY=NEVER ^
   -DPACKAGE_VERSION=%package_version% ^
   -DLLDB_RELOCATABLE_PYTHON=1 ^

From d18f15fe53bc1948e427ed1481b1f586938009d9 Mon Sep 17 00:00:00 2001
From: Ikhlas Ajbar <iajbar@quicinc.com>
Date: Tue, 25 Feb 2025 09:07:29 -0600
Subject: [PATCH 203/282] [Hexagon] Add a case to BitTracker for new register
 class (#128580)

Code in the HexagonBitTracker checks for a specific register class when
processing sub-registers. A crash occurred due to a register class that
was not handled. The register class is
DoubleRegs_with_isub_hi_in_IntRegsLow8RegClassID, which is a class
formed by creating a register pair when one of the sub registers is a
Low8 integer register.
Fixes #128078
Patch by: Brendon Cahoon

(cherry picked from commit 4f7d8948d9d9a0d366ac737247abab2246834e05)
---
 llvm/lib/Target/Hexagon/HexagonBitTracker.cpp |  2 +
 .../CodeGen/Hexagon/bittracker-regclass.ll    | 40 +++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 llvm/test/CodeGen/Hexagon/bittracker-regclass.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index d0580b91380ac..6a48af59fbd59 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -94,6 +94,7 @@ BT::BitMask HexagonEvaluator::mask(Register Reg, unsigned Sub) const {
   bool IsSubLo = (Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo));
   switch (ID) {
     case Hexagon::DoubleRegsRegClassID:
+    case Hexagon::DoubleRegs_with_isub_hi_in_IntRegsLow8RegClassID:
     case Hexagon::HvxWRRegClassID:
     case Hexagon::HvxVQRRegClassID:
       return IsSubLo ? BT::BitMask(0, RW-1)
@@ -139,6 +140,7 @@ const TargetRegisterClass &HexagonEvaluator::composeWithSubRegIndex(
 
   switch (RC.getID()) {
     case Hexagon::DoubleRegsRegClassID:
+    case Hexagon::DoubleRegs_with_isub_hi_in_IntRegsLow8RegClassID:
       return Hexagon::IntRegsRegClass;
     case Hexagon::HvxWRRegClassID:
       return Hexagon::HvxVRRegClass;
diff --git a/llvm/test/CodeGen/Hexagon/bittracker-regclass.ll b/llvm/test/CodeGen/Hexagon/bittracker-regclass.ll
new file mode 100644
index 0000000000000..436a23399c11d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/bittracker-regclass.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv75 -mattr=+hvxv75,+hvx-length64b,-small-data < %s | FileCheck %s
+
+; Test that the compiler generates code, and doesn't crash, when the compiler
+; creates a DoubleReg value with an IntLow8Reg value. The BitTracker pass
+; needs to handle this register class.
+
+; CHECK: [[REG:r[0-9]+:[0-9]+]] = combine(#33,#32)
+; CHECK: memd({{.*}}) = [[REG]]
+
+@out = external dso_local global [100 x i32], align 512
+@in55 = external dso_local global [55 x i32], align 256
+@.str.3 = external dso_local unnamed_addr constant [29 x i8], align 1
+
+define dso_local void @main(i1 %cond) local_unnamed_addr #0 {
+entry:
+  br label %for.body.i198
+
+for.body.i198:
+  br i1 %cond, label %for.body34.preheader, label %for.body.i198
+
+for.body34.preheader:
+  %wide.load269.5 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([100 x i32], [100 x i32]* @out, i32 0, i32 80) to <16 x i32>*), align 64
+  %0 = add nsw <16 x i32> %wide.load269.5, zeroinitializer
+  %rdx.shuf270 = shufflevector <16 x i32> %0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx271 = add <16 x i32> %0, %rdx.shuf270
+  %bin.rdx273 = add <16 x i32> %bin.rdx271, zeroinitializer
+  %bin.rdx275 = add <16 x i32> %bin.rdx273, zeroinitializer
+  %bin.rdx277 = add <16 x i32> %bin.rdx275, zeroinitializer
+  %1 = extractelement <16 x i32> %bin.rdx277, i32 0
+  %add45 = add nsw i32 0, %1
+  %add45.1 = add nsw i32 0, %add45
+  %add45.2 = add nsw i32 0, %add45.1
+  %add45.3 = add nsw i32 0, %add45.2
+  call void (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.3, i32 0, i32 0), i32 %add45.3) #2
+  store i32 32, i32* getelementptr inbounds ([55 x i32], [55 x i32]* @in55, i32 0, i32 32), align 128
+  store i32 33, i32* getelementptr inbounds ([55 x i32], [55 x i32]* @in55, i32 0, i32 33), align 4
+  ret void
+}
+
+declare dso_local void @printf(i8*, ...) local_unnamed_addr #1

From addf1c97117e52bd52f445a360047c2752df9d1a Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Tue, 25 Feb 2025 14:49:59 +0000
Subject: [PATCH 204/282] Do not treat llvm.fake.use as a debug instruction
 (#128684)

The llvm.fake.use intrinsic is used to prevent certain values from being
optimized out for the benefit of debug info; it is not, however, a debug
or pseudo instruction itself and necessarily must not be treated as one,
since its purpose is to act like a normal instruction. In the original
commit that added them, the IR intrinsic however was treated as one in
`getPrevNonDebugInstruction` (but _not_ in `getNextNonDebugInstruction`,
or in the MIR equivalents). This patch correctly treats it as a
non-debug instruction.

(cherry picked from commit af68927a831c45b92248b1f6fc24d445be42dd91)
---
 llvm/lib/IR/Instruction.cpp                   |  5 +-
 .../X86/fake-use-considered-when-sinking.ll   | 67 +++++++++++++++++++
 2 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll

diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 84d9306ca6700..7d43de982df0d 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1233,10 +1233,7 @@ Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
 const Instruction *
 Instruction::getPrevNonDebugInstruction(bool SkipPseudoOp) const {
   for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode())
-    if (!isa<DbgInfoIntrinsic>(I) &&
-        !(SkipPseudoOp && isa<PseudoProbeInst>(I)) &&
-        !(isa<IntrinsicInst>(I) &&
-          cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fake_use))
+    if (!isa<DbgInfoIntrinsic>(I) && !(SkipPseudoOp && isa<PseudoProbeInst>(I)))
       return I;
   return nullptr;
 }
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll b/llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll
new file mode 100644
index 0000000000000..63217315c978c
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p="simplifycfg<sink-common-insts>" -S < %s | FileCheck %s
+
+;; Verify that fake uses are not ignored when sinking instructions in
+;; SimplifyCFG; when a fake use appears in only one incoming block they prevent
+;; further sinking, and when identical fake uses appear on both sides they
+;; are sunk normally.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i1 %bool, ptr %p) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i1 [[BOOL:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[BOOL]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[COMMON_RET:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    store ptr [[P]], ptr [[P]], align 8
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    store ptr [[P]], ptr [[P]], align 8
+; CHECK-NEXT:    notail call void (...) @llvm.fake.use(ptr [[P]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+;
+entry:
+  br i1 %bool, label %if.else, label %if.then
+
+common.ret:                                       ; preds = %if.else, %if.then
+  ret void
+
+if.then:                                          ; preds = %entry
+  store ptr %p, ptr %p, align 8
+  br label %common.ret
+
+if.else:                                          ; preds = %entry
+  store ptr %p, ptr %p, align 8
+  notail call void (...) @llvm.fake.use(ptr %p)
+  br label %common.ret
+}
+
+define void @bar(i1 %bool, ptr %p) {
+; CHECK-LABEL: define void @bar(
+; CHECK-SAME: i1 [[BOOL:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store ptr [[P]], ptr [[P]], align 8
+; CHECK-NEXT:    notail call void (...) @llvm.fake.use(ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %bool, label %if.else, label %if.then
+
+common.ret:                                       ; preds = %if.else, %if.then
+  ret void
+
+if.then:                                          ; preds = %entry
+  store ptr %p, ptr %p, align 8
+  notail call void (...) @llvm.fake.use(ptr %p)
+  br label %common.ret
+
+if.else:                                          ; preds = %entry
+  store ptr %p, ptr %p, align 8
+  notail call void (...) @llvm.fake.use(ptr %p)
+  br label %common.ret
+}
+

From 5d3b9e2f92bbb6c1f3b4bb68f1f63aff72aaa7b1 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1@ibm.com>
Date: Tue, 25 Feb 2025 18:01:32 -0500
Subject: [PATCH 205/282] [PowerPC] Update LLVM 20.1.0 Release Notes (#128764)

This PR adds LLVM 20.1.0 release notes that are related to the PowerPC
target.

---------

Co-authored-by: Hubert Tong <hubert.reinterpretcast@gmail.com>
---
 clang/docs/ReleaseNotes.rst |  2 ++
 llvm/docs/ReleaseNotes.md   | 30 +++++++++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a2518042cb5b0..153afdb3d59e3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1269,6 +1269,8 @@ CUDA Support
 
 AIX Support
 ^^^^^^^^^^^
+- Fixed the ``-print-runtime-dir`` option.
+- Enable continuous profile syncing feature on AIX.
 
 NetBSD Support
 ^^^^^^^^^^^^^^
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index e654509792652..958b7adbc4c36 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -263,7 +263,23 @@ Changes to the PowerPC Backend
 ------------------------------
 
 * The Linux `ppc64` LLC default cpu is updated from `ppc` to `ppc64`.
-* The AIX LLC default cpu is updated from `generic` to `pwr7`.
+* Replaced PPCMergeStringPool with GlobalMerge.
+* Disabled vsx and altivec when -msoft-float is used.
+* Added support for -mcpu=pwr11 -mtune=pwr11.
+* Implemented BCD assist builtins.
+* Expanded global named register support.
+* Updated to use tablegen's MatchRegisterName().
+* Fixed saving of Link Register when using ROP Protect.
+* Fixed SUBREG_TO_REG handling in the RegisterCoalescer.
+* Fixed data layout alignment of i128 to 16.
+* Fixed codegen for transparent_union function parameters.
+* Added an error for incorrect use of memory operands.
+* Other various bug fixes and codegen improvements.
+
+AIX Specific:
+* LLC default cpu is updated from `generic` to `pwr7`.
+* Fixed handling in emitGlobalConstantImpl to emit aliases to subobjects at proper offsets.
+* Enabled aggressive merging of constants to reduce TOC entries.
 
 Changes to the RISC-V Backend
 -----------------------------
@@ -478,6 +494,10 @@ Changes to the LLVM tools
 
 * llvm-objcopy now prints the correct file path in the error message when the output file specified by `--dump-section` cannot be opened.
 
+* llvm-cxxfilt now supports demangling call expressions encoded using `cp` instead of `cl`.
+
+* llvm-objdump now supports printing the file header, load section header and auxiliary header for XCOFF object files under the ``--private-headers`` option.
+
 Changes to LLDB
 ---------------------------------
 
@@ -630,6 +650,14 @@ Changes to BOLT
 Changes to Sanitizers
 ---------------------
 
+Changes to the Profile Runtime
+------------------------------
+
+* On platforms where ``atexit``-registered functions are not called when
+  a DSO is ``dlclose``'d, a mechanism is added that implements this
+  missing functionality for calls to ``atexit`` in the profile runtime.
+  [This is currently only enabled on AIX](https://github.com/llvm/llvm-project/pull/102940).
+
 Other Changes
 -------------
 

From b8b7fce446a6516be6bee61456e7292f7c1afb86 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 24 Feb 2025 18:06:51 +0100
Subject: [PATCH 206/282] [PPC][MC] Restore support for case-insensitive
 register names (#128525)

Lowercase the name before calling MatchRegisterName(), to restore
support for using `%R3` instead of `%r3` and similar, matching the GNU
assembler.

Fixes https://github.com/llvm/llvm-project/issues/126786.

(cherry picked from commit f1252f539ca203a979d61b616186e9be9d612f96)
---
 llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 13 ++++++++-----
 llvm/test/MC/PowerPC/case-insensitive-regs.s       | 11 +++++++++++
 2 files changed, 19 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/MC/PowerPC/case-insensitive-regs.s

diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index dc75814b9796b..47e7aa8785e69 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -1320,7 +1320,10 @@ MCRegister PPCAsmParser::matchRegisterName(int64_t &IntVal) {
   if (!getParser().getTok().is(AsmToken::Identifier))
     return MCRegister();
 
-  StringRef Name = getParser().getTok().getString();
+  // MatchRegisterName() expects lower-case registers, but we want to support
+  // case-insensitive spelling.
+  std::string NameBuf = getParser().getTok().getString().lower();
+  StringRef Name(NameBuf);
   MCRegister RegNo = MatchRegisterName(Name);
   if (!RegNo)
     return RegNo;
@@ -1329,15 +1332,15 @@ MCRegister PPCAsmParser::matchRegisterName(int64_t &IntVal) {
 
   // MatchRegisterName doesn't seem to have special handling for 64bit vs 32bit
   // register types.
-  if (Name.equals_insensitive("lr")) {
+  if (Name == "lr") {
     RegNo = isPPC64() ? PPC::LR8 : PPC::LR;
     IntVal = 8;
-  } else if (Name.equals_insensitive("ctr")) {
+  } else if (Name == "ctr") {
     RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR;
     IntVal = 9;
-  } else if (Name.equals_insensitive("vrsave"))
+  } else if (Name == "vrsave")
     IntVal = 256;
-  else if (Name.starts_with_insensitive("r"))
+  else if (Name.starts_with("r"))
     RegNo = isPPC64() ? XRegs[IntVal] : RRegs[IntVal];
 
   getParser().Lex();
diff --git a/llvm/test/MC/PowerPC/case-insensitive-regs.s b/llvm/test/MC/PowerPC/case-insensitive-regs.s
new file mode 100644
index 0000000000000..f20a590318504
--- /dev/null
+++ b/llvm/test/MC/PowerPC/case-insensitive-regs.s
@@ -0,0 +1,11 @@
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown %s 2>&1 | FileCheck %s
+
+# Test that upper case registers are accepted.
+
+# CHECK-LABEL: test:
+# CHECK-NEXT: ld 1, 0(3)
+# CHECK-NEXT: blr
+
+test:
+    ld %R1, 0(%R3)
+    blr

From a2112e20c736b1648ada1aa0b976f42fedf62a23 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 25 Feb 2025 07:33:42 -0800
Subject: [PATCH 207/282] [CMake][Release] Enable bolt optimization for clang
 on Linux (#128090)

Also stop buiding the bolt project on other platforms since bolt only
supports ELF.

(cherry picked from commit 148111fdcf0e807fe74274b18fcf65c4cff45d63)
---
 .github/workflows/release-binaries.yml |  2 +-
 clang/cmake/caches/Release.cmake       | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index fa3dfa5b7d313..a18f64d6bc226 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -142,7 +142,7 @@ jobs:
             # 2-3 hours to build on macOS, much slower than on Linux.
             # The long build time causes the release build to time out on x86_64,
             # so we need to disable flang there.
-            target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;bolt;polly;mlir'"
+            target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;polly;mlir'"
           fi
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
         fi
diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index a1c68fc51dbd0..aedbd1a25fb38 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -29,6 +29,13 @@ endfunction()
 # cache file to CMake via -C. e.g.
 #
 # cmake -D LLVM_RELEASE_ENABLE_PGO=ON -C Release.cmake
+
+set (DEFAULT_PROJECTS "clang;lld;lldb;clang-tools-extra;polly;mlir;flang")
+# bolt only supports ELF, so only enable it for Linux.
+if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+  list(APPEND DEFAULT_PROJECTS "bolt")
+endif()
+
 set (DEFAULT_RUNTIMES "compiler-rt;libcxx")
 if (NOT WIN32)
   list(APPEND DEFAULT_RUNTIMES "libcxxabi" "libunwind")
@@ -36,7 +43,7 @@ endif()
 set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "")
 set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "")
 set(LLVM_RELEASE_ENABLE_RUNTIMES ${DEFAULT_RUNTIMES} CACHE STRING "")
-set(LLVM_RELEASE_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "")
+set(LLVM_RELEASE_ENABLE_PROJECTS ${DEFAULT_PROJECTS} CACHE STRING "")
 # Note we don't need to add install here, since it is one of the pre-defined
 # steps.
 set(LLVM_RELEASE_FINAL_STAGE_TARGETS "clang;package;check-all;check-llvm;check-clang" CACHE STRING "")
@@ -118,6 +125,11 @@ if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin")
   set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc")
 endif()
 
+# Set flags for bolt
+if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+  set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -Wl,--emit-relocs,-znow")
+endif()
+
 set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
 set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
 set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
@@ -125,6 +137,9 @@ set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FL
 # Final Stage Config (stage2)
 set_final_stage_var(LLVM_ENABLE_RUNTIMES "${LLVM_RELEASE_ENABLE_RUNTIMES}" STRING)
 set_final_stage_var(LLVM_ENABLE_PROJECTS "${LLVM_RELEASE_ENABLE_PROJECTS}" STRING)
+if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+  set_final_stage_var(CLANG_BOLT "INSTRUMENT" STRING)
+endif()
 set_final_stage_var(CPACK_GENERATOR "TXZ" STRING)
 set_final_stage_var(CPACK_ARCHIVE_THREADS "0" STRING)
 

From 560ac756cb18db3f6252b4d43e0444b10fd36e8b Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 25 Feb 2025 16:10:20 -0800
Subject: [PATCH 208/282] [CMake][Release] Statically link ZSTD on all OSes
 (#128554)

This will make the binaries more portable.

(cherry picked from commit 09832777d830e0fddff84bf36793ec4e453656b0)
---
 clang/cmake/caches/Release.cmake | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index aedbd1a25fb38..fb12dfcdcb5a5 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -143,6 +143,4 @@ endif()
 set_final_stage_var(CPACK_GENERATOR "TXZ" STRING)
 set_final_stage_var(CPACK_ARCHIVE_THREADS "0" STRING)
 
-if(${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin")
-  set_final_stage_var(LLVM_USE_STATIC_ZSTD "ON" BOOL)
-endif()
+set_final_stage_var(LLVM_USE_STATIC_ZSTD "ON" BOOL)

From a69568efe6c4972e71af295c6577b3412dd57c22 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 25 Feb 2025 18:02:43 -0800
Subject: [PATCH 209/282] Bump version to 20.1.0-rc3

---
 cmake/Modules/LLVMVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 848c94feb19e1..4048a7466b6d4 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -10,6 +10,6 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX -rc2)
+  set(LLVM_VERSION_SUFFIX -rc3)
 endif()
 

From 83058aadeef4a2995c2610abe3119ed86371cd3e Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 12 Feb 2025 10:01:46 -0800
Subject: [PATCH 210/282] workflows/release-binaries-all: Add missing secret
 input (#126921)

Since d194c6b9a7fdda7a61abcd6bfe39ab465bf0cc87 this workflow was missing
the secret input which was causing it to fail.

(cherry picked from commit a684e0ea57ebb93c81506c066afb25cb496dcc11)
---
 .github/workflows/release-binaries-all.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml
index d18b9b0b5c2ff..fd4694ebea32d 100644
--- a/.github/workflows/release-binaries-all.yml
+++ b/.github/workflows/release-binaries-all.yml
@@ -27,6 +27,10 @@ on:
         required: true
         default: false
         type: boolean
+    secrets:
+      RELEASE_TASKS_USER_TOKEN:
+        description: "Secret used to check user permissions."
+        required: false
 
   pull_request:
     types:

From b23c3cc933e3a057f08ea7737d8f2a141d9a8452 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 14 Feb 2025 07:28:44 -0800
Subject: [PATCH 211/282] workflows/release-binaries: Fix macos-14 build
 (#127157)

This was broken when pgo was enabled by
0572580dd040a81dc69b798e202550d51d17204a.

(cherry picked from commit d595d5a770d93c9564268fc631d85f3a6ce1f505)
---
 .github/workflows/release-binaries.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index a18f64d6bc226..231dd26e54ae0 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -133,7 +133,7 @@ jobs:
         # add extra CMake args to disable them.
         # See https://github.com/llvm/llvm-project/issues/99767
         if [ "$RUNNER_OS" = "macOS" ]; then
-          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
+          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
           if [ "$RUNNER_ARCH" = "ARM64" ]; then
             arches=arm64
           else
@@ -144,7 +144,7 @@ jobs:
             # so we need to disable flang there.
             target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;polly;mlir'"
           fi
-          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
+          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
         fi
 
         build_flang="true"

From f88a2946737a24565d47eafb7e8257ddf8cbb4fb Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Fri, 21 Feb 2025 20:46:43 -0800
Subject: [PATCH 212/282] [clang-format] Fix a bug that changes keyword `or` to
 an identifier (#128410)

Backports ffc61dc393e4 0968df9c3a55 2d585ccecc45

Fixes #105482
---
 clang/docs/ClangFormatStyleOptions.rst        | 13 +++++-
 clang/docs/ReleaseNotes.rst                   |  4 ++
 clang/include/clang/Format/Format.h           | 17 ++++++--
 clang/lib/Format/Format.cpp                   | 43 ++++++++++++++++++-
 clang/lib/Format/FormatToken.cpp              | 10 ++---
 clang/lib/Format/FormatToken.h                | 23 ----------
 clang/lib/Format/TokenAnnotator.cpp           |  4 +-
 clang/lib/Format/TokenAnnotator.h             |  2 +-
 clang/lib/Format/UnwrappedLineParser.cpp      |  8 +---
 clang/unittests/Format/FormatTest.cpp         | 19 +++++++-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 11 ++++-
 11 files changed, 106 insertions(+), 48 deletions(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index bbb912eb10e94..4b4c412a13323 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -4735,15 +4735,24 @@ the configuration (without a prefix: ``Auto``).
 .. _Language:
 
 **Language** (``LanguageKind``) :versionbadge:`clang-format 3.5` :ref:`¶ <Language>`
-  Language, this format style is targeted at.
+  The language that this format style targets.
+
+  .. note::
+
+   You can specify the language (``C``, ``Cpp``, or ``ObjC``) for ``.h``
+   files by adding a ``// clang-format Language:`` line before the first
+   non-comment (and non-empty) line, e.g. ``// clang-format Language: Cpp``.
 
   Possible values:
 
   * ``LK_None`` (in configuration: ``None``)
     Do not use.
 
+  * ``LK_C`` (in configuration: ``C``)
+    Should be used for C.
+
   * ``LK_Cpp`` (in configuration: ``Cpp``)
-    Should be used for C, C++.
+    Should be used for C++.
 
   * ``LK_CSharp`` (in configuration: ``CSharp``)
     Should be used for C#.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 153afdb3d59e3..57a567509a068 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1358,6 +1358,10 @@ clang-format
 - Adds ``WrapNamespaceBodyWithEmptyLines`` option.
 - Adds the ``IndentExportBlock`` option.
 - Adds ``PenaltyBreakBeforeMemberAccess`` option.
+- Add the C language instead of treating it like C++.
+- Allow specifying the language (C, C++, or Objective-C) for a ``.h`` file by
+  adding a special comment (e.g. ``// clang-format Language: ObjC``) near the
+  top of the file.
 
 libclang
 --------
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 6f432d1d50315..abab543518222 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -3275,7 +3275,9 @@ struct FormatStyle {
   enum LanguageKind : int8_t {
     /// Do not use.
     LK_None,
-    /// Should be used for C, C++.
+    /// Should be used for C.
+    LK_C,
+    /// Should be used for C++.
     LK_Cpp,
     /// Should be used for C#.
     LK_CSharp,
@@ -3300,7 +3302,9 @@ struct FormatStyle {
     /// https://sci-hub.st/10.1109/IEEESTD.2018.8299595
     LK_Verilog
   };
-  bool isCpp() const { return Language == LK_Cpp || Language == LK_ObjC; }
+  bool isCpp() const {
+    return Language == LK_Cpp || Language == LK_C || Language == LK_ObjC;
+  }
   bool isCSharp() const { return Language == LK_CSharp; }
   bool isJson() const { return Language == LK_Json; }
   bool isJavaScript() const { return Language == LK_JavaScript; }
@@ -3310,7 +3314,12 @@ struct FormatStyle {
   }
   bool isTableGen() const { return Language == LK_TableGen; }
 
-  /// Language, this format style is targeted at.
+  /// The language that this format style targets.
+  /// \note
+  ///  You can specify the language (``C``, ``Cpp``, or ``ObjC``) for ``.h``
+  ///  files by adding a ``// clang-format Language:`` line before the first
+  ///  non-comment (and non-empty) line, e.g. ``// clang-format Language: Cpp``.
+  /// \endnote
   /// \version 3.5
   LanguageKind Language;
 
@@ -5665,6 +5674,8 @@ FormatStyle::LanguageKind guessLanguage(StringRef FileName, StringRef Code);
 // Returns a string representation of ``Language``.
 inline StringRef getLanguageName(FormatStyle::LanguageKind Language) {
   switch (Language) {
+  case FormatStyle::LK_C:
+    return "C";
   case FormatStyle::LK_Cpp:
     return "C++";
   case FormatStyle::LK_CSharp:
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index f02bf95cfeed7..0bb8545884442 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -401,6 +401,7 @@ template <> struct MappingTraits<FormatStyle::KeepEmptyLinesStyle> {
 
 template <> struct ScalarEnumerationTraits<FormatStyle::LanguageKind> {
   static void enumeration(IO &IO, FormatStyle::LanguageKind &Value) {
+    IO.enumCase(Value, "C", FormatStyle::LK_C);
     IO.enumCase(Value, "Cpp", FormatStyle::LK_Cpp);
     IO.enumCase(Value, "Java", FormatStyle::LK_Java);
     IO.enumCase(Value, "JavaScript", FormatStyle::LK_JavaScript);
@@ -3952,7 +3953,12 @@ LangOptions getFormattingLangOpts(const FormatStyle &Style) {
   LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11;
 
   LangOpts.LineComment = 1;
-  LangOpts.CXXOperatorNames = Style.isCpp();
+
+  const auto Language = Style.Language;
+  LangOpts.C17 = Language == FormatStyle::LK_C;
+  LangOpts.CXXOperatorNames =
+      Language == FormatStyle::LK_Cpp || Language == FormatStyle::LK_ObjC;
+
   LangOpts.Bool = 1;
   LangOpts.ObjC = 1;
   LangOpts.MicrosoftExt = 1;    // To get kw___try, kw___finally.
@@ -3977,6 +3983,8 @@ const char *StyleOptionHelpDescription =
     "   --style=\"{BasedOnStyle: llvm, IndentWidth: 8}\"";
 
 static FormatStyle::LanguageKind getLanguageByFileName(StringRef FileName) {
+  if (FileName.ends_with(".c"))
+    return FormatStyle::LK_C;
   if (FileName.ends_with(".java"))
     return FormatStyle::LK_Java;
   if (FileName.ends_with_insensitive(".js") ||
@@ -4016,6 +4024,35 @@ static FormatStyle::LanguageKind getLanguageByFileName(StringRef FileName) {
   return FormatStyle::LK_Cpp;
 }
 
+static FormatStyle::LanguageKind getLanguageByComment(const Environment &Env) {
+  const auto ID = Env.getFileID();
+  const auto &SourceMgr = Env.getSourceManager();
+
+  LangOptions LangOpts;
+  LangOpts.CPlusPlus = 1;
+  LangOpts.LineComment = 1;
+
+  Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts);
+  Lex.SetCommentRetentionState(true);
+
+  for (Token Tok; !Lex.LexFromRawLexer(Tok) && Tok.is(tok::comment);) {
+    auto Text = StringRef(SourceMgr.getCharacterData(Tok.getLocation()),
+                          Tok.getLength());
+    if (!Text.consume_front("// clang-format Language:"))
+      continue;
+
+    Text = Text.trim();
+    if (Text == "C")
+      return FormatStyle::LK_C;
+    if (Text == "Cpp")
+      return FormatStyle::LK_Cpp;
+    if (Text == "ObjC")
+      return FormatStyle::LK_ObjC;
+  }
+
+  return FormatStyle::LK_None;
+}
+
 FormatStyle::LanguageKind guessLanguage(StringRef FileName, StringRef Code) {
   const auto GuessedLanguage = getLanguageByFileName(FileName);
   if (GuessedLanguage == FormatStyle::LK_Cpp) {
@@ -4025,6 +4062,10 @@ FormatStyle::LanguageKind guessLanguage(StringRef FileName, StringRef Code) {
     if (!Code.empty() && (Extension.empty() || Extension == ".h")) {
       auto NonEmptyFileName = FileName.empty() ? "guess.h" : FileName;
       Environment Env(Code, NonEmptyFileName, /*Ranges=*/{});
+      if (const auto Language = getLanguageByComment(Env);
+          Language != FormatStyle::LK_None) {
+        return Language;
+      }
       ObjCHeaderStyleGuesser Guesser(Env, getLLVMStyle());
       Guesser.process();
       if (Guesser.isObjC())
diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
index 963e8f87793fa..60e428123d26d 100644
--- a/clang/lib/Format/FormatToken.cpp
+++ b/clang/lib/Format/FormatToken.cpp
@@ -42,11 +42,11 @@ static SmallVector<StringRef> CppNonKeywordTypes = {
 };
 
 bool FormatToken::isTypeName(const LangOptions &LangOpts) const {
-  const bool IsCpp = LangOpts.CXXOperatorNames;
-  return is(TT_TypeName) || Tok.isSimpleTypeSpecifier(LangOpts) ||
-         (IsCpp && is(tok::identifier) &&
-          std::binary_search(CppNonKeywordTypes.begin(),
-                             CppNonKeywordTypes.end(), TokenText));
+  if (is(TT_TypeName) || Tok.isSimpleTypeSpecifier(LangOpts))
+    return true;
+  return (LangOpts.CXXOperatorNames || LangOpts.C17) && is(tok::identifier) &&
+         std::binary_search(CppNonKeywordTypes.begin(),
+                            CppNonKeywordTypes.end(), TokenText);
 }
 
 bool FormatToken::isTypeOrIdentifier(const LangOptions &LangOpts) const {
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index 29aba281ae103..02429970599c0 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -743,29 +743,6 @@ struct FormatToken {
     return isOneOf(tok::star, tok::amp, tok::ampamp);
   }
 
-  bool isCppAlternativeOperatorKeyword() const {
-    assert(!TokenText.empty());
-    if (!isalpha(TokenText[0]))
-      return false;
-
-    switch (Tok.getKind()) {
-    case tok::ampamp:
-    case tok::ampequal:
-    case tok::amp:
-    case tok::pipe:
-    case tok::tilde:
-    case tok::exclaim:
-    case tok::exclaimequal:
-    case tok::pipepipe:
-    case tok::pipeequal:
-    case tok::caret:
-    case tok::caretequal:
-      return true;
-    default:
-      return false;
-    }
-  }
-
   bool isUnaryOperator() const {
     switch (Tok.getKind()) {
     case tok::plus:
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index ac5b25d52ce84..976c4d888e1fd 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -129,7 +129,7 @@ class AnnotatingParser {
       : Style(Style), Line(Line), CurrentToken(Line.First), AutoFound(false),
         IsCpp(Style.isCpp()), LangOpts(getFormattingLangOpts(Style)),
         Keywords(Keywords), Scopes(Scopes), TemplateDeclarationDepth(0) {
-    assert(IsCpp == LangOpts.CXXOperatorNames);
+    assert(IsCpp == (LangOpts.CXXOperatorNames || LangOpts.C17));
     Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/false));
     resetTokenMetadata();
   }
@@ -3820,7 +3820,7 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts,
   };
 
   const auto *Next = Current.Next;
-  const bool IsCpp = LangOpts.CXXOperatorNames;
+  const bool IsCpp = LangOpts.CXXOperatorNames || LangOpts.C17;
 
   // Find parentheses of parameter list.
   if (Current.is(tok::kw_operator)) {
diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h
index 6aea310a56d69..c0c13941ef4f7 100644
--- a/clang/lib/Format/TokenAnnotator.h
+++ b/clang/lib/Format/TokenAnnotator.h
@@ -225,7 +225,7 @@ class TokenAnnotator {
   TokenAnnotator(const FormatStyle &Style, const AdditionalKeywords &Keywords)
       : Style(Style), IsCpp(Style.isCpp()),
         LangOpts(getFormattingLangOpts(Style)), Keywords(Keywords) {
-    assert(IsCpp == LangOpts.CXXOperatorNames);
+    assert(IsCpp == (LangOpts.CXXOperatorNames || LangOpts.C17));
   }
 
   /// Adapts the indent levels of comment lines to the indent of the
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 1411197e32554..9b4257fdd8c8f 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -168,7 +168,7 @@ UnwrappedLineParser::UnwrappedLineParser(
                        : IG_Inited),
       IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn),
       Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {
-  assert(IsCpp == LangOpts.CXXOperatorNames);
+  assert(IsCpp == (LangOpts.CXXOperatorNames || LangOpts.C17));
 }
 
 void UnwrappedLineParser::reset() {
@@ -1712,12 +1712,6 @@ void UnwrappedLineParser::parseStructuralElement(
            OpeningBrace && OpeningBrace->isOneOf(TT_RequiresExpressionLBrace,
                                                  TT_CompoundRequirementLBrace);
        !eof();) {
-    if (IsCpp && FormatTok->isCppAlternativeOperatorKeyword()) {
-      if (auto *Next = Tokens->peekNextToken(/*SkipComment=*/true);
-          Next && Next->isBinaryOperator()) {
-        FormatTok->Tok.setKind(tok::identifier);
-      }
-    }
     const FormatToken *Previous = FormatTok->Previous;
     switch (FormatTok->Tok.getKind()) {
     case tok::at:
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 3b7856d6ee150..d1e96e0fa544a 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -17784,9 +17784,11 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeAssignmentOperators) {
   verifyFormat("int a = 5;");
   verifyFormat("a += 42;");
   verifyFormat("a or_eq 8;");
-  verifyFormat("xor = foo;");
 
-  FormatStyle Spaces = getLLVMStyle();
+  auto Spaces = getLLVMStyle(FormatStyle::LK_C);
+  verifyFormat("xor = foo;", Spaces);
+
+  Spaces.Language = FormatStyle::LK_Cpp;
   Spaces.SpaceBeforeAssignmentOperators = false;
   verifyFormat("int a= 5;", Spaces);
   verifyFormat("a+= 42;", Spaces);
@@ -24683,6 +24685,7 @@ TEST_F(FormatTest, StructuredBindings) {
 }
 
 TEST_F(FormatTest, FileAndCode) {
+  EXPECT_EQ(FormatStyle::LK_C, guessLanguage("foo.c", ""));
   EXPECT_EQ(FormatStyle::LK_Cpp, guessLanguage("foo.cc", ""));
   EXPECT_EQ(FormatStyle::LK_ObjC, guessLanguage("foo.m", ""));
   EXPECT_EQ(FormatStyle::LK_ObjC, guessLanguage("foo.mm", ""));
@@ -24848,6 +24851,18 @@ TEST_F(FormatTest, GuessLanguageWithChildLines) {
       guessLanguage("foo.h", "#define FOO ({ foo(); ({ NSString *s; }) })"));
 }
 
+TEST_F(FormatTest, GetLanguageByComment) {
+  EXPECT_EQ(FormatStyle::LK_C,
+            guessLanguage("foo.h", "// clang-format Language: C\n"
+                                   "int i;"));
+  EXPECT_EQ(FormatStyle::LK_Cpp,
+            guessLanguage("foo.h", "// clang-format Language: Cpp\n"
+                                   "int DoStuff(CGRect rect);"));
+  EXPECT_EQ(FormatStyle::LK_ObjC,
+            guessLanguage("foo.h", "// clang-format Language: ObjC\n"
+                                   "int i;"));
+}
+
 TEST_F(FormatTest, TypenameMacros) {
   std::vector<std::string> TypenameMacros = {"STACK_OF", "LIST", "TAILQ_ENTRY"};
 
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index dffb07c89bacc..f1a6999cfdfb8 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3646,6 +3646,11 @@ TEST_F(TokenAnnotatorTest, CppAltOperatorKeywords) {
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[3], tok::pipepipe, TT_BinaryOperator);
 
+  Tokens = annotate("return segment < *this or *this < segment;");
+  ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::pipepipe, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[6], tok::star, TT_UnaryOperator);
+
   Tokens = annotate("a = b or_eq c;");
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[3], tok::pipeequal, TT_BinaryOperator);
@@ -3658,11 +3663,13 @@ TEST_F(TokenAnnotatorTest, CppAltOperatorKeywords) {
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[3], tok::caretequal, TT_BinaryOperator);
 
-  Tokens = annotate("xor = foo;");
+  const auto StyleC = getLLVMStyle(FormatStyle::LK_C);
+
+  Tokens = annotate("xor = foo;", StyleC);
   ASSERT_EQ(Tokens.size(), 5u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
 
-  Tokens = annotate("int xor = foo;");
+  Tokens = annotate("int xor = foo;", StyleC);
   ASSERT_EQ(Tokens.size(), 6u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::identifier, TT_StartOfName);
 }

From 794ba171a43baaf9de9739440ddb582500527031 Mon Sep 17 00:00:00 2001
From: Nathan Ridge <zeratul976@hotmail.com>
Date: Tue, 4 Mar 2025 10:01:18 -0500
Subject: [PATCH 213/282] [clangd] Add clangd 20 release notes (#127358)

---
 clang-tools-extra/docs/ReleaseNotes.rst | 53 +++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index cc5f64a3f9fa3..316ac1743ccb7 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -56,7 +56,8 @@ Improvements to clangd
 Inlay hints
 ^^^^^^^^^^^
 
-- Added `DefaultArguments` Inlay Hints option.
+- Added support for inlay hints for default arguments, enabled using the
+  `DefaultArguments` config option (#GH95712)
 
 Diagnostics
 ^^^^^^^^^^^
@@ -67,21 +68,42 @@ Semantic Highlighting
 Compile flags
 ^^^^^^^^^^^^^
 
+- Fixed a bug where clangd would unnecessarily reparse open files whose
+  compile command did not change when receiving a new compile command
+  via an LSP `workspace/configuration` request (#GH115438)
+
 Hover
 ^^^^^
 
+- Hovering over a function name now shows the function's documentation
+  comment even if the comment is written above the function's out-of-line
+  definition in a different source file (#GH67802)
+
 Code completion
 ^^^^^^^^^^^^^^^
 
+- Added an `ArgumentLists` config option under `Completion`. This is a more
+  flexible version of the `--function-arg-placeholders` command line flag,
+  allowing users more detailed control of what is inserted in argument list
+  position when clangd completes the name of a function in a function call
+  context. (#GH111322)
+- Clangd now supports configuring which headers should be inserted using 
+  `<>` vs. `""` syntax using the `QuotedHeaders` and `AngledHeaders` config
+  options under `Style` (#GH67749)
 - Added completion for C++20 keywords.
+- Improved code completion behaviour in dependent/templated code
+- Completion items now include documentation comments in more cases (#GH120099)
 
 Code actions
 ^^^^^^^^^^^^
 
 - Added `Swap operands` tweak for certain binary operators.
-
 - Improved the extract-to-function code action to allow extracting statements
   with overloaded operators like ``<<`` of ``std::ostream``.
+- `Define outline` now handles member functions of class templates, and
+  member function templates.
+- `Extract variable` can now operate on the top-level expression in an
+  expression statement (#GH112525)
 
 Signature help
 ^^^^^^^^^^^^^^
@@ -89,13 +111,38 @@ Signature help
 Cross-references
 ^^^^^^^^^^^^^^^^
 
+- Clangd now supports the "outgoing calls" direction of call hierarchy
+  (#GH77556)
+- Call hierarchy can now be invoked on fields and namespace-scope
+  variables (#GH113900)
+- Improved heuristics for filtering out generated Protobuf symbol names
+  during indexing (#GH110091)
+- Compiler intrinsics defined in `*intrin.h` system headers are now
+  indexed even if they have reserved names (#GH119735)
+- Various improvements to go-to-definition in templated code
+
 Objective-C
 ^^^^^^^^^^^
 
+Clang-tidy integration
+^^^^^^^^^^^^^^^^^^^^^^
+
+- Improved robustness in handling clang-tidy check names (#GH109421)
+
+C++20 Modules Support
+^^^^^^^^^^^^^^^^^^^^^
+
+- Support code completion for symbols defined in modules (#GH110083)
+- Improve performance when opening files that import modules (#GH106683)
+- Compile commands for modules now respect modifications specified in `.clangd`
+  files (#GH122606)
+
 Miscellaneous
 ^^^^^^^^^^^^^
 
-- The DefineOutline tweak now handles member functions of class templates.
+- Fixed an OOM affecting some versions of libcxx headers compiled in C++20
+  mode (#GH108866)
+- Various other stability improvements, e.g. crash fixes
 
 Improvements to clang-doc
 -------------------------

From 24a30daaa559829ad079f2ff7f73eb4e18095f88 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 4 Mar 2025 11:50:46 -0800
Subject: [PATCH 214/282] Bump version to 20.1.0 (final)

---
 cmake/Modules/LLVMVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 4048a7466b6d4..be4bb4329715e 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -10,6 +10,6 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX -rc3)
+  set(LLVM_VERSION_SUFFIX)
 endif()
 

From 0e537474ea5907df91b979d86bf3a1a9050b1ee7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 11 Mar 2025 12:58:18 -0700
Subject: [PATCH 215/282] Bump version to 20.1.1 (#130806)

---
 cmake/Modules/LLVMVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index be4bb4329715e..05e772cb6fa38 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -7,7 +7,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
+  set(LLVM_VERSION_PATCH 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX)

From 20adce87104c242ce92ef173849b1d7d5305e8c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 28 Feb 2025 13:56:16 -0100
Subject: [PATCH 216/282] [libc++][ci] Update the Windows toolchains to Clang
 19 (#129232)

This also fixes test failures in the clang-cl build configs that started
a couple days ago. It seems like the failures were triggered by an update
to the base image on the Github provided runners.

There were failures in test/libcxx/system_reserved_names.gen.py, due to
an issue in an Clang intrinsics header (avx512fp16intrin.h); this issue
was observed and fixed for Clang 19 in 6f04f46927c. The test does
    #define A SYSTEM_RESERVED_NAME
which clashes with a parameter with the name `A` in that header.

By upgrading the toolchain to Clang 19, we get fixed version of this
intrinsics header.

Also update the llvm-mingw toolchains to a version with Clang 19.1.7.

(cherry picked from commit e6a0ee3d1d12c9c02c1a361109e282d18dd2430c)
---
 .github/workflows/libcxx-build-and-test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index ee77e83363d37..3346c1322a07c 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -251,11 +251,11 @@ jobs:
       - name: Install a current LLVM
         if: ${{ matrix.mingw != true }}
         run: |
-          choco install -y llvm --version=18.1.6 --allow-downgrade
+          choco install -y llvm --version=19.1.7 --allow-downgrade
       - name: Install llvm-mingw
         if: ${{ matrix.mingw == true }}
         run: |
-          curl -LO https://github.com/mstorsjo/llvm-mingw/releases/download/20240606/llvm-mingw-20240606-ucrt-x86_64.zip
+          curl -LO https://github.com/mstorsjo/llvm-mingw/releases/download/20250114/llvm-mingw-20250114-ucrt-x86_64.zip
           powershell Expand-Archive llvm-mingw*.zip -DestinationPath .
           del llvm-mingw*.zip
           mv llvm-mingw* c:\llvm-mingw

From 5b552d780ae82b07aaed5703690e6a7cc431ee95 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang@sifive.com>
Date: Tue, 25 Feb 2025 11:09:09 +0800
Subject: [PATCH 217/282] [LV][VPlan] Prevent calculate cost for skiped
 instructions in precomputeCosts(). (#127966)

Skip calculating instruction costs for exit conditions in
precomputeCosts() when it should be skipped.

Reported from:
https://github.com/llvm/llvm-project/issues/115744#issuecomment-2670479463
Godbolt for reduced test cases: https://godbolt.org/z/fr4YMeqcv

(cherry picked from commit 8009c1fd81ad0b6ac65724d2b134a92db48f8fbf)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   5 +-
 .../LoopVectorize/X86/cost-model.ll           | 124 ++++++++++++++++++
 2 files changed, 127 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ceeec48487f6..7cd395255163a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7239,7 +7239,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
   // Collect all exit conditions.
   for (BasicBlock *EB : Exiting) {
     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
-    if (!Term)
+    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
       continue;
     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
       ExitInstrs.insert(CondI);
@@ -7259,7 +7259,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     Cost += CondICost;
     for (Value *Op : CondI->operands()) {
       auto *OpI = dyn_cast<Instruction>(Op);
-      if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
+      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
+          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
                    !ExitInstrs.contains(cast<Instruction>(U));
           }))
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index bd28e28ddff95..3718a092d9aa0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -1211,6 +1211,130 @@ exit:
   ret i32 %or
 }
 
+; Check if the vplan-based cost model select same VF to the legacy cost model.
+; Reduced from: https://github.com/llvm/llvm-project/issues/115744#issuecomment-2670479463
+define i32 @g(i64 %n) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[N]], 4294967295
+; CHECK-NEXT:    br i1 [[TMP2]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i32> [[STEP_ADD_2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i32> [[STEP_ADD_3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
+; CHECK-NEXT:    [[TMP15]] = or <4 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP16]] = or <4 x i32> [[TMP12]], [[VEC_PHI2]]
+; CHECK-NEXT:    [[TMP17]] = or <4 x i32> [[TMP13]], [[VEC_PHI3]]
+; CHECK-NEXT:    [[TMP18]] = or <4 x i32> [[TMP14]], [[VEC_PHI4]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = or <4 x i32> [[TMP18]], [[BIN_RDX5]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX6]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF7:%.*]] = urem i32 [[TMP1]], 4
+; CHECK-NEXT:    [[N_VEC8:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF7]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT13]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX9:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND10:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[TMP21]], [[VEC_EPILOG_PH]] ], [ [[TMP25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <4 x i32> [[VEC_IND10]] to <4 x i64>
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT14]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = select <4 x i1> [[TMP23]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
+; CHECK-NEXT:    [[TMP25]] = or <4 x i32> [[TMP24]], [[VEC_PHI12]]
+; CHECK-NEXT:    [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4)
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]])
+; CHECK-NEXT:    [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]]
+; CHECK-NEXT:    br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL17:%.*]] = phi i32 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX18:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SELECT:%.*]] = phi i32 [ [[BC_MERGE_RDX18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_WIDEN:%.*]] = zext i32 [[IV]] to i64
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[N]], [[IV_WIDEN]]
+; CHECK-NEXT:    [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2
+; CHECK-NEXT:    [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[SELECT_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %select = phi i32 [ 0, %entry ], [ %select.next, %loop ]
+  %iv.widen = zext i32 %iv to i64
+  %exitcond = icmp eq i64 %n, %iv.widen
+  %select.i = select i1 %exitcond, i32 0, i32 2
+  %select.next = or i32 %select.i, %select
+  %iv.next = add i32 %iv, 1
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret i32 %select.next
+}
+
 declare void @llvm.assume(i1 noundef) #0
 
 attributes #0 = { "target-cpu"="penryn" }

From 0f5e7e86e38e6ba249af45f18f3610acef1a9274 Mon Sep 17 00:00:00 2001
From: Jonathan Albrecht <jonathan.albrecht@ibm.com>
Date: Fri, 28 Feb 2025 04:16:19 -0500
Subject: [PATCH 218/282] [SystemZ] Add header guard macros to vecintrin.h
 (#129170)

Add header guard macros to clang/lib/Headers/vecintrin.h. Found while
compiling the latest numpy with clang 19 on s390x which ends up
including vecintrin.h twice. The gcc version of this file has header
guards so numpy compiles fine with gcc.

Signed-off-by: Jonathan Albrecht <jonathan.albrecht@ibm.com>
(cherry picked from commit ddaa5b3bfb2980f79c6f277608ad33a6efe8d554)
---
 clang/lib/Headers/vecintrin.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/lib/Headers/vecintrin.h b/clang/lib/Headers/vecintrin.h
index a14c39f9f7313..338ea51ce8863 100644
--- a/clang/lib/Headers/vecintrin.h
+++ b/clang/lib/Headers/vecintrin.h
@@ -7,6 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
+#ifndef _VECINTRIN_H
+#define _VECINTRIN_H
+
 #if defined(__s390x__) && defined(__VEC__)
 
 #define __ATTRS_ai __attribute__((__always_inline__))
@@ -12861,3 +12864,5 @@ vec_search_string_until_zero_cc(__vector unsigned int __a,
 #error "Use -fzvector to enable vector extensions"
 
 #endif
+
+#endif /* _VECINTRIN_H */

From dcc378e862de5ba51174ec1c77a73df8628f178b Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Fri, 28 Feb 2025 15:41:44 -0500
Subject: [PATCH 219/282] [libc++] Guard <codecvt> contents on
 _LIBCPP_HAS_LOCALIZATION (#129112)

The codecvt class is defined in <locale> and the contents of the
<codecvt> header don't work when localization is disabled. Without this
guard, builds with localization disabled that happen to include
<codecvt> could be broken because they would try to include <__locale>,
which ends up trying to include the locale base API and eventually
platform headers like <xlocale.h> that may not exist.

(cherry picked from commit fda7373daf5790833101c504be1c749bbb0fceb8)
---
 libcxx/include/codecvt | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/libcxx/include/codecvt b/libcxx/include/codecvt
index f7ae804c6789c..0526b8512175f 100644
--- a/libcxx/include/codecvt
+++ b/libcxx/include/codecvt
@@ -58,14 +58,17 @@ class codecvt_utf8_utf16
 #  include <__cxx03/codecvt>
 #else
 #  include <__config>
-#  include <__locale>
-#  include <version>
 
-#  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#    pragma GCC system_header
-#  endif
+#  if _LIBCPP_HAS_LOCALIZATION
+
+#    include <__locale>
+#    include <version>
 
-#  if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT)
+#    if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#      pragma GCC system_header
+#    endif
+
+#    if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT)
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
@@ -76,7 +79,7 @@ enum _LIBCPP_DEPRECATED_IN_CXX17 codecvt_mode { consume_header = 4, generate_hea
 template <class _Elem>
 class __codecvt_utf8;
 
-#    if _LIBCPP_HAS_WIDE_CHARACTERS
+#      if _LIBCPP_HAS_WIDE_CHARACTERS
 template <>
 class _LIBCPP_EXPORTED_FROM_ABI __codecvt_utf8<wchar_t> : public codecvt<wchar_t, char, mbstate_t> {
   unsigned long __maxcode_;
@@ -115,7 +118,7 @@ protected:
   int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const override;
   int do_max_length() const _NOEXCEPT override;
 };
-#    endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#      endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
 template <>
@@ -206,7 +209,7 @@ _LIBCPP_SUPPRESS_DEPRECATED_POP
 template <class _Elem, bool _LittleEndian>
 class __codecvt_utf16;
 
-#    if _LIBCPP_HAS_WIDE_CHARACTERS
+#      if _LIBCPP_HAS_WIDE_CHARACTERS
 template <>
 class _LIBCPP_EXPORTED_FROM_ABI __codecvt_utf16<wchar_t, false> : public codecvt<wchar_t, char, mbstate_t> {
   unsigned long __maxcode_;
@@ -284,7 +287,7 @@ protected:
   int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const override;
   int do_max_length() const _NOEXCEPT override;
 };
-#    endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#      endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
 template <>
@@ -451,7 +454,7 @@ _LIBCPP_SUPPRESS_DEPRECATED_POP
 template <class _Elem>
 class __codecvt_utf8_utf16;
 
-#    if _LIBCPP_HAS_WIDE_CHARACTERS
+#      if _LIBCPP_HAS_WIDE_CHARACTERS
 template <>
 class _LIBCPP_EXPORTED_FROM_ABI __codecvt_utf8_utf16<wchar_t> : public codecvt<wchar_t, char, mbstate_t> {
   unsigned long __maxcode_;
@@ -490,7 +493,7 @@ protected:
   int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const override;
   int do_max_length() const _NOEXCEPT override;
 };
-#    endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#      endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
 template <>
@@ -579,7 +582,9 @@ _LIBCPP_SUPPRESS_DEPRECATED_POP
 
 _LIBCPP_END_NAMESPACE_STD
 
-#  endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT)
+#    endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT)
+
+#  endif // _LIBCPP_HAS_LOCALIZATION
 
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #    include <atomic>

From 6525b151fe774ee6b426f17623c146abbddedf83 Mon Sep 17 00:00:00 2001
From: pkarveti <quic_pkarveti@quicinc.com>
Date: Tue, 25 Feb 2025 21:56:28 +0530
Subject: [PATCH 220/282] [Hexagon] Handle Call Operand vxi1 in Hexagon Backend
 (#128027)

This commit updates the Hexagon backend to handle
vxi1 call operands. It ensures compatibility for
vector types of sizes 4, 8, 16, 32, 64, and 128 x i1 when HVX is
enabled.

~Fixes #59009 and #118879~

(cherry picked from commit 37559c8401cf9236d561eebd75bd3d70be6ab723)
---
 llvm/lib/Target/Hexagon/HexagonCallingConv.td | 18 +++++++
 .../CodeGen/Hexagon/calloperand-v128i1.ll     | 39 +++++++++++++++
 .../test/CodeGen/Hexagon/calloperand-v16i1.ll | 40 +++++++++++++++
 .../test/CodeGen/Hexagon/calloperand-v32i1.ll | 50 +++++++++++++++++++
 llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll | 39 +++++++++++++++
 .../test/CodeGen/Hexagon/calloperand-v64i1.ll | 50 +++++++++++++++++++
 llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll | 39 +++++++++++++++
 7 files changed, 275 insertions(+)
 create mode 100644 llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
index cc41b569e4904..2378bbc928d49 100644
--- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -65,6 +65,8 @@ def CC_Hexagon: CallingConv<[
   CCIfType<[i32],
     CCIfSplit<
       CCCustom<"CC_SkipOdd">>>,
+  CCIfType<[v4i1],  CCPromoteToType<v4i16>>,
+  CCIfType<[v8i1],  CCPromoteToType<v8i8>>,
 
   CCIfType<[i32,v2i16,v4i8],
     CCAssignToReg<[R0,R1,R2,R3,R4,R5]>>,
@@ -111,6 +113,14 @@ class CCIfHvx128<CCAction A>
 
 def CC_Hexagon_HVX: CallingConv<[
   // HVX 64-byte mode
+
+  CCIfHvx64<
+        CCIfType<[v16i1], CCPromoteToType<v16i32>>>,
+  CCIfHvx64<
+        CCIfType<[v32i1], CCPromoteToType<v32i16>>>,
+  CCIfHvx64<
+        CCIfType<[v64i1], CCPromoteToType<v64i8>>>,
+
   CCIfHvx64<
     CCIfType<[v16i32,v32i16,v64i8],
       CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
@@ -125,6 +135,14 @@ def CC_Hexagon_HVX: CallingConv<[
       CCAssignToStack<128,64>>>,
 
   // HVX 128-byte mode
+
+  CCIfHvx128<
+        CCIfType<[v32i1], CCPromoteToType<v32i32>>>,
+  CCIfHvx128<
+        CCIfType<[v64i1], CCPromoteToType<v64i16>>>,
+  CCIfHvx128<
+        CCIfType<[v128i1], CCPromoteToType<v128i8>>>,
+
   CCIfHvx128<
     CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16],
       CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll
new file mode 100644
index 0000000000000..ddac8c1cd8279
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll
@@ -0,0 +1,39 @@
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.b,v{{[0-9]+}}.b)
+; CHECK: [[REG2:(r[0-9]+)]] = #-1
+; CHECK: v0 = vand([[REG1]],[[REG2]])
+
+define void @compare_vectors(<128 x i8> %a, <128 x i8> %b) {
+entry:
+  %result = icmp eq <128 x i8> %a, %b
+  call i32 @f.1(<128 x i1> %result)
+  ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+
+define i32 @f.1(<128 x i1> %vec) {
+  %element = extractelement <128 x i1> %vec, i32 6
+  %is_true = icmp eq i1 %element, true
+  br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+  call void @action_if_true()
+  br label %end
+
+if_false:
+  call void @action_if_false()
+  br label %end
+
+end:
+  %result = phi i32 [1, %if_true], [0, %if_false]
+  ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll
new file mode 100644
index 0000000000000..bbb2697246df1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll
@@ -0,0 +1,40 @@
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.w,v{{[0-9]+}}.w)
+; CHECK: [[REG2:(r[0-9]+)]] = #-1
+; CHECK: v0  = vand([[REG1]],[[REG2]])
+
+define void @compare_vectors(<16 x i32> %a, <16 x i32> %b) {
+entry:
+  %result = icmp eq <16 x i32> %a, %b
+  call i32 @f.1(<16 x i1> %result)
+  ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+
+define i32 @f.1(<16 x i1> %vec) {
+  %element = extractelement <16 x i1> %vec, i32 6
+  %is_true = icmp eq i1 %element, true
+  br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+  call void @action_if_true()
+  br label %end
+
+if_false:
+  call void @action_if_false()
+  br label %end
+
+end:
+  %result = phi i32 [1, %if_true], [0, %if_false]
+  ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll
new file mode 100644
index 0000000000000..a73478728d910
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK-128
+
+; CHECK-LABEL: compare_vectors
+; CHECK-64: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.h,v{{[0-9]+}}.h)
+; CHECK-64: [[REG2:(r[0-9]+)]] = #-1
+; CHECK-64: v0 = vand([[REG1]],[[REG2]])
+; CHECK-128: r{{[0-9]+}}:{{[0-9]+}} = combine(##.LCPI0_0,#-1)
+; CHECK-128: [[REG1:(q[0-9]+)]] = vcmp.eq(v0.h,v1.h)
+; CHECK-128: [[REG2:(v[0-9]+)]] = vand([[REG1]],r{{[0-9]+}})
+; CHECK-128: [[REG3:(v[0-9]+)]] = vmem(r{{[0-9]+}}+#0)
+; CHECK-128: [[REG4:(v[0-9]+)]] = vdelta([[REG2]],[[REG3]])
+; CHECK-128: [[REG5:(q[0-9]+)]] = vand([[REG4]],r{{[0-9]+}})
+; CHECK-128: v0 = vand([[REG5]],r{{[0-9]+}})
+
+define void @compare_vectors(<32 x i16> %a, <32 x i16> %b) {
+entry:
+  %result = icmp eq <32 x i16> %a, %b
+  call i32 @f.1(<32 x i1> %result)
+  ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK-64: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-64: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK-64: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+; CHECK-128: [[REG6:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-128: [[REG7:(v[0-9]+)]] = vand([[REG6]],r{{[0-9]+}})
+; CHECK-128: r{{[0-9]+}} = vextract([[REG7]],r{{[0-9]+}})
+
+define i32 @f.1(<32 x i1> %vec) {
+  %element = extractelement <32 x i1> %vec, i32 6
+  %is_true = icmp eq i1 %element, true
+  br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+  call void @action_if_true()
+  br label %end
+
+if_false:
+  call void @action_if_false()
+  br label %end
+
+end:
+  %result = phi i32 [1, %if_true], [0, %if_false]
+  ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll
new file mode 100644
index 0000000000000..af23e6c788ea5
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll
@@ -0,0 +1,39 @@
+;RUN: llc -mtriple=hexagon  < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG0:(p[0-9]+)]] = vcmph.eq([[REG1:(r[0-9]+):[0-9]]],[[REG2:(r[0-9]+):[0-9]]])
+; CHECK: [[REG1:(r[0-9]+):[0-9]]] = CONST64(#281479271743489)
+; CHECK: [[REG2:(r[0-9]+):[0-9]]] = mask([[REG0]])
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = and([[REG2]],[[REG1]])
+
+define void @compare_vectors(<4 x i16> %a, <4 x i16> %b) {
+entry:
+  %result = icmp eq <4 x i16> %a, %b
+  call i32 @f.1(<4 x i1> %result)
+  ret void
+}
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(r[0-9]+)]] = and([[REG3]],##65537)
+; CHECK: [[REG4:(r[0-9]+)]] = and([[REG4]],##65537)
+define i32 @f.1(<4 x i1> %vec) {
+  %element = extractelement <4 x i1> %vec, i32 2
+  %is_true = icmp eq i1 %element, true
+  br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+  call void @action_if_true()
+  br label %end
+
+if_false:
+  call void @action_if_false()
+  br label %end
+
+end:
+  %result = phi i32 [1, %if_true], [0, %if_false]
+  ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll
new file mode 100644
index 0000000000000..7cc562085a7e6
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK-128
+
+; CHECK-LABEL: compare_vectors
+; CHECK-64: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.b,v{{[0-9]+}}.b)
+; CHECK-64: [[REG2:(r[0-9]+)]] = #-1
+; CHECK-64: v0 = vand([[REG1]],[[REG2]])
+; CHECK-128: r{{[0-9]+}}:{{[0-9]+}} = combine(##.LCPI0_0,#-1)
+; CHECK-128: [[REG1:(q[0-9]+)]] = vcmp.eq(v0.b,v1.b)
+; CHECK-128: [[REG2:(v[0-9]+)]] = vand([[REG1]],r{{[0-9]+}})
+; CHECK-128: [[REG3:(v[0-9]+)]] = vmem(r{{[0-9]+}}+#0)
+; CHECK-128: [[REG4:(v[0-9]+)]] = vdelta([[REG2]],[[REG3]])
+; CHECK-128: [[REG5:(q[0-9]+)]] = vand([[REG4]],r{{[0-9]+}})
+; CHECK-128: v0 = vand([[REG5]],r{{[0-9]+}})
+
+define void @compare_vectors(<64 x i8> %a, <64 x i8> %b) {
+entry:
+  %result = icmp eq <64 x i8> %a, %b
+  call i32 @f.1(<64 x i1> %result)
+  ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK-64: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-64: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK-64: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+; CHECK-128: [[REG6:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-128: [[REG7:(v[0-9]+)]] = vand([[REG6]],r{{[0-9]+}})
+; CHECK-128: r{{[0-9]+}} = vextract([[REG7]],r{{[0-9]+}})
+
+define i32 @f.1(<64 x i1> %vec) {
+  %element = extractelement <64 x i1> %vec, i32 6
+  %is_true = icmp eq i1 %element, true
+  br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+  call void @action_if_true()
+  br label %end
+
+if_false:
+  call void @action_if_false()
+  br label %end
+
+end:
+  %result = phi i32 [1, %if_true], [0, %if_false]
+  ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll
new file mode 100644
index 0000000000000..2ec163acd6ae7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll
@@ -0,0 +1,39 @@
+;RUN: llc -mtriple=hexagon  < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG0:(p[0-9]+)]] = vcmpb.eq([[REG1:(r[0-9]+):[0-9]]],[[REG2:(r[0-9]+):[0-9]]])
+; CHECK: [[REG1:(r[0-9]+):[0-9]]] = CONST64(#72340172838076673)
+; CHECK: [[REG2:(r[0-9]+):[0-9]]] = mask([[REG0]])
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = and([[REG2]],[[REG1]])
+
+define void @compare_vectors(<8 x i8> %a, <8 x i8> %b) {
+entry:
+  %result = icmp eq <8 x i8> %a, %b
+  call i32 @f.1(<8 x i1> %result)
+  ret void
+}
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(r[0-9]+)]] = and([[REG3]],##16843009)
+; CHECK: [[REG4:(r[0-9]+)]] = and([[REG4]],##16843009)
+define i32 @f.1(<8 x i1> %vec) {
+  %element = extractelement <8 x i1> %vec, i32 6
+  %is_true = icmp eq i1 %element, true
+  br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+  call void @action_if_true()
+  br label %end
+
+if_false:
+  call void @action_if_false()
+  br label %end
+
+end:
+  %result = phi i32 [1, %if_true], [0, %if_false]
+  ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()

From 712d3c7f0944e8163843e449e2f3c159d0dbdd46 Mon Sep 17 00:00:00 2001
From: Jordan R AW <ajordanr@google.com>
Date: Fri, 14 Feb 2025 21:37:39 -0800
Subject: [PATCH 221/282] [lldb] Add terminfo dependency for ncurses support
 (#126810)

For some operating systems (e.g. chromiumos), terminfo is a separate
package and library from ncurses. Both are still requirements for curses
support in lldb, individually.

This is a rework of this original spack commit:

https://github.com/spack/spack/commit/9ea261265010eacd250691a8361f661d0576f25c

Instead though, this PR uses CMake to detect whether the symbol is
present and defined in the curses library, and only falls back to a separate
tinfo if not found.

Without this fix, LLDB cannot be built on these systems.

Fixes #101368

(cherry picked from commit 8fff0c181f26a5e8b2344c061ebf2559118b1160)
---
 lldb/cmake/modules/FindCursesAndPanel.cmake | 42 ++++++++++++++++++---
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/lldb/cmake/modules/FindCursesAndPanel.cmake b/lldb/cmake/modules/FindCursesAndPanel.cmake
index aaadf214bf54b..75ebaa35d7ea1 100644
--- a/lldb/cmake/modules/FindCursesAndPanel.cmake
+++ b/lldb/cmake/modules/FindCursesAndPanel.cmake
@@ -2,23 +2,55 @@
 # FindCursesAndPanel
 # -----------
 #
-# Find the curses and panel library as a whole.
+# Find the curses, terminfo, and panel library as a whole.
 
-if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND PANEL_LIBRARIES)
+include(CMakePushCheckState)
+
+function(lldb_check_curses_tinfo CURSES_LIBRARIES CURSES_HAS_TINFO)
+  cmake_reset_check_state()
+  set(CMAKE_REQUIRED_LIBRARIES "${CURSES_LIBRARIES}")
+  # acs_map is one of many symbols that are part of tinfo but could
+  # be bundled in curses.
+  check_symbol_exists(acs_map "curses.h" CURSES_HAS_TINFO)
+endfunction()
+
+if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND TINFO_LIBRARIES AND PANEL_LIBRARIES)
   set(CURSESANDPANEL_FOUND TRUE)
 else()
   find_package(Curses QUIET)
   find_library(PANEL_LIBRARIES NAMES panel DOC "The curses panel library" QUIET)
   include(FindPackageHandleStandardArgs)
+
+  if(CURSES_FOUND AND PANEL_LIBRARIES)
+    # Sometimes the curses libraries define their own terminfo symbols,
+    # other times they're extern and are defined by a separate terminfo library.
+    # Auto-detect which.
+    lldb_check_curses_tinfo("${CURSES_LIBRARIES}" CURSES_HAS_TINFO)
+    if (NOT CURSES_HAS_TINFO)
+      message(STATUS "curses library missing terminfo symbols, looking for tinfo separately")
+      find_library(TINFO_LIBRARIES NAMES tinfo DOC "The curses tinfo library" QUIET)
+      list(APPEND CURSES_LIBRARIES "${TINFO_LIBRARIES}")
+    endif()
+    set(HAS_TERMINFO_SYMBOLS "$<OR:$<BOOL:${TERMINFO_LIBRARIES}>,$<BOOL:${CURSES_HAS_TINFO}>>")
+  endif()
+
   find_package_handle_standard_args(CursesAndPanel
                                     FOUND_VAR
                                       CURSESANDPANEL_FOUND
                                     REQUIRED_VARS
                                       CURSES_INCLUDE_DIRS
                                       CURSES_LIBRARIES
-                                      PANEL_LIBRARIES)
-  if(CURSES_FOUND AND PANEL_LIBRARIES)
-    mark_as_advanced(CURSES_INCLUDE_DIRS CURSES_LIBRARIES PANEL_LIBRARIES)
+                                      PANEL_LIBRARIES
+                                      HAS_TERMINFO_SYMBOLS)
+
+  if(CURSES_FOUND AND PANEL_LIBRARIES AND HAS_TERMINFO_SYMBOLS)
+    mark_as_advanced(CURSES_INCLUDE_DIRS
+                      PANEL_LIBRARIES
+                      HAS_TERMINFO_SYMBOLS
+                      CURSES_HAS_TINFO)
+  endif()
+  if(TINFO_LIBRARIES)
+    mark_as_advanced(TINFO_LIBRARIES)
   endif()
 endif()
 

From 54c90e4cdf2ff335cf8804876e686deac3b803e0 Mon Sep 17 00:00:00 2001
From: Jordan R AW <ajordanr@google.com>
Date: Sat, 22 Feb 2025 09:13:46 -0800
Subject: [PATCH 222/282] [lldb] Fix manual CURSES_LIBRARIES tinfo finding
 (#128245)

(cherry picked from commit bb6a273d9ab9ee90dbb957e541f4d810fffb22ee)
---
 lldb/cmake/modules/FindCursesAndPanel.cmake | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/lldb/cmake/modules/FindCursesAndPanel.cmake b/lldb/cmake/modules/FindCursesAndPanel.cmake
index 75ebaa35d7ea1..8628059f91ba1 100644
--- a/lldb/cmake/modules/FindCursesAndPanel.cmake
+++ b/lldb/cmake/modules/FindCursesAndPanel.cmake
@@ -6,15 +6,25 @@
 
 include(CMakePushCheckState)
 
-function(lldb_check_curses_tinfo CURSES_LIBRARIES CURSES_HAS_TINFO)
+function(lldb_check_curses_tinfo CURSES_INCLUDE_DIRS CURSES_LIBRARIES CURSES_HAS_TINFO)
   cmake_reset_check_state()
+  set(CMAKE_REQUIRED_INCLUDES "${CURSES_INCLUDE_DIRS}")
   set(CMAKE_REQUIRED_LIBRARIES "${CURSES_LIBRARIES}")
   # acs_map is one of many symbols that are part of tinfo but could
   # be bundled in curses.
   check_symbol_exists(acs_map "curses.h" CURSES_HAS_TINFO)
 endfunction()
 
-if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND TINFO_LIBRARIES AND PANEL_LIBRARIES)
+if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND PANEL_LIBRARIES)
+  if(NOT HAS_TERMINFO_SYMBOLS)
+    lldb_check_curses_tinfo("${CURSES_INCLUDE_DIRS}"
+                            "${CURSES_LIBRARIES}"
+                            CURSES_HAS_TINFO)
+    if(NOT CURSES_HAS_TINFO)
+      message(WARNING "CURSES_LIBRARIES was provided manually but is missing terminfo symbols")
+    endif()
+    mark_as_advanced(CURSES_HAS_TINFO)
+  endif()
   set(CURSESANDPANEL_FOUND TRUE)
 else()
   find_package(Curses QUIET)
@@ -25,8 +35,10 @@ else()
     # Sometimes the curses libraries define their own terminfo symbols,
     # other times they're extern and are defined by a separate terminfo library.
     # Auto-detect which.
-    lldb_check_curses_tinfo("${CURSES_LIBRARIES}" CURSES_HAS_TINFO)
-    if (NOT CURSES_HAS_TINFO)
+    lldb_check_curses_tinfo("${CURSES_INCLUDE_DIRS}"
+                            "${CURSES_LIBRARIES}"
+                            CURSES_HAS_TINFO)
+    if(NOT CURSES_HAS_TINFO)
       message(STATUS "curses library missing terminfo symbols, looking for tinfo separately")
       find_library(TINFO_LIBRARIES NAMES tinfo DOC "The curses tinfo library" QUIET)
       list(APPEND CURSES_LIBRARIES "${TINFO_LIBRARIES}")

From 0064565bce3fbe8d605aea7c39e87e806346ff71 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sat, 1 Mar 2025 20:21:28 +0800
Subject: [PATCH 223/282] [DAGCombiner] Don't ignore N2's undef elements in
 `foldVSelectOfConstants` (#129272)

Since N2 will be reused in the fold, we cannot skip N2's undef elements
if the corresponding element in N1 is well-defined.
For example:
```
t2: v4i32 = BUILD_VECTOR Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>
t24: v4i32 = BUILD_VECTOR undef:i32, undef:i32, Constant:i32<1>, undef:i32
t11: v4i32 = vselect t8, t2, t10
```
Before this patch, we fold t11 into:
```
t26: v4i32 = sign_extend t8
t27: v4i32 = add t26, t24
```
The last element of t27 is incorrect.

Closes https://github.com/llvm/llvm-project/issues/129181.

(cherry picked from commit 2709366f75b054e2cba4f61310de5a9605f4aa24)
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +++--
 llvm/test/CodeGen/X86/vselect-constants.ll    | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9d04568483677..e921ced8326b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12547,9 +12547,10 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
   for (unsigned i = 0; i != Elts; ++i) {
     SDValue N1Elt = N1.getOperand(i);
     SDValue N2Elt = N2.getOperand(i);
-    if (N1Elt.isUndef() || N2Elt.isUndef())
+    if (N1Elt.isUndef())
       continue;
-    if (N1Elt.getValueType() != N2Elt.getValueType()) {
+    // N2 should not contain undef values since it will be reused in the fold.
+    if (N2Elt.isUndef() || N1Elt.getValueType() != N2Elt.getValueType()) {
       AllAddOne = false;
       AllSubOne = false;
       break;
diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll
index 901f7e4a00eb5..34bda718db8f6 100644
--- a/llvm/test/CodeGen/X86/vselect-constants.ll
+++ b/llvm/test/CodeGen/X86/vselect-constants.ll
@@ -302,3 +302,21 @@ define i32 @wrong_min_signbits(<2 x i16> %x) {
   %t1 = bitcast <2 x i16> %sel to i32
   ret i32 %t1
 }
+
+define i32 @pr129181() {
+; SSE-LABEL: pr129181:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: pr129181:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    retq
+entry:
+  %x = insertelement <4 x i32> zeroinitializer, i32 0, i32 0
+  %cmp = icmp ult <4 x i32> %x, splat (i32 1)
+  %sel = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 0, i32 1, i32 poison>
+  %reduce = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
+  ret i32 %reduce
+}

From 1d4d84c89be6955f80b90c54c6d24dcedd915cce Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 3 Mar 2025 15:50:11 +0000
Subject: [PATCH 224/282] [AArch64] Don't try to custom lower fp16 selects with
 nofp (#129492)

If we do not have fp then we do not need to try and custom lower fp16
selects.

Fixes #129394.

(cherry picked from commit cb850fef2a564ea330e8a4878fafb4f5b4a7a98e)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  6 +-
 .../16bit-float-promotion-with-nofp.ll        | 91 +++++++++++++++++++
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b5cca88b6b511..a2824d6a8f5af 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -498,8 +498,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
   setOperationAction(ISD::SELECT, MVT::i32, Custom);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
-  setOperationAction(ISD::SELECT, MVT::f16, Custom);
-  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::SELECT, MVT::f16, Custom);
+    setOperationAction(ISD::SELECT, MVT::bf16, Custom);
+  }
   setOperationAction(ISD::SELECT, MVT::f32, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
index bfe9ab8424bb0..f560420e2c920 100644
--- a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
+++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
@@ -29,3 +29,94 @@ entry:
   ret bfloat %0
 }
 
+define double @select_f64(double %a, double %b, i1 %c) {
+; CHECK-LABEL: select_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w2, #0x1
+; CHECK-NEXT:    csel x0, x0, x1, ne
+; CHECK-NEXT:    ret
+entry:
+  %0 = select i1 %c, double %a, double %b
+  ret double %0
+}
+
+define float @select_f32(float %a, float %b, i1 %c) {
+; CHECK-LABEL: select_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w2, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+entry:
+  %0 = select i1 %c, float %a, float %b
+  ret float %0
+}
+
+define half @select_f16(half %a, half %b, i1 %c) {
+; CHECK-LABEL: select_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w2, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+entry:
+  %0 = select i1 %c, half %a, half %b
+  ret half %0
+}
+
+define bfloat @select_bf16(bfloat %a, bfloat %b, i1 %c) {
+; CHECK-LABEL: select_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w2, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+entry:
+  %0 = select i1 %c, bfloat %a, bfloat %b
+  ret bfloat %0
+}
+
+define double @selectcc_f64(double %a, double %b, i32 %d) {
+; CHECK-LABEL: selectcc_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #0
+; CHECK-NEXT:    csel x0, x0, x1, lt
+; CHECK-NEXT:    ret
+entry:
+  %c = icmp slt i32 %d, 0
+  %0 = select i1 %c, double %a, double %b
+  ret double %0
+}
+
+define float @selectcc_f32(float %a, float %b, i32 %d) {
+; CHECK-LABEL: selectcc_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+entry:
+  %c = icmp slt i32 %d, 0
+  %0 = select i1 %c, float %a, float %b
+  ret float %0
+}
+
+define half @selectcc_f16(half %a, half %b, i32 %d) {
+; CHECK-LABEL: selectcc_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+entry:
+  %c = icmp slt i32 %d, 0
+  %0 = select i1 %c, half %a, half %b
+  ret half %0
+}
+
+define bfloat @selectcc_bf16(bfloat %a, bfloat %b, i32 %d) {
+; CHECK-LABEL: selectcc_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+entry:
+  %c = icmp slt i32 %d, 0
+  %0 = select i1 %c, bfloat %a, bfloat %b
+  ret bfloat %0
+}

From f7a4e3a4d45dfd53755258cf1bb530c0567eb8ad Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Mon, 10 Mar 2025 19:42:54 -0700
Subject: [PATCH 225/282] [clang-format] Don't remove parentheses separated
 from ellipsis by comma (#130471)

Also clean up `case tok::r_paren` in
`UnwrappedLineParser::parseParens()`.

Fix #130359

(cherry picked from commit 7d4d8509cbec7eecd8aaf2510015b54bc5c173e1)
---
 clang/lib/Format/UnwrappedLineParser.cpp | 86 ++++++++++++++----------
 clang/unittests/Format/FormatTest.cpp    |  4 ++
 2 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 9b4257fdd8c8f..9a03e9409fcbc 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2562,12 +2562,12 @@ bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) {
 /// Returns whether there is a `=` token between the parentheses.
 bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
   assert(FormatTok->is(tok::l_paren) && "'(' expected.");
-  auto *LeftParen = FormatTok;
+  auto *LParen = FormatTok;
   bool SeenComma = false;
   bool SeenEqual = false;
   bool MightBeFoldExpr = false;
-  const bool MightBeStmtExpr = Tokens->peekNextToken()->is(tok::l_brace);
   nextToken();
+  const bool MightBeStmtExpr = FormatTok->is(tok::l_brace);
   do {
     switch (FormatTok->Tok.getKind()) {
     case tok::l_paren:
@@ -2577,44 +2577,60 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
         parseChildBlock();
       break;
     case tok::r_paren: {
-      auto *Prev = LeftParen->Previous;
-      if (!MightBeStmtExpr && !MightBeFoldExpr && !Line->InMacroBody &&
-          Style.RemoveParentheses > FormatStyle::RPS_Leave) {
-        const auto *Next = Tokens->peekNextToken();
-        const bool DoubleParens =
-            Prev && Prev->is(tok::l_paren) && Next && Next->is(tok::r_paren);
-        const bool CommaSeparated =
-            !DoubleParens && Prev && Prev->isOneOf(tok::l_paren, tok::comma) &&
-            Next && Next->isOneOf(tok::comma, tok::r_paren);
-        const auto *PrevPrev = Prev ? Prev->getPreviousNonComment() : nullptr;
-        const bool Excluded =
-            PrevPrev &&
-            (PrevPrev->isOneOf(tok::kw___attribute, tok::kw_decltype) ||
-             SeenComma ||
-             (SeenEqual &&
-              (PrevPrev->isOneOf(tok::kw_if, tok::kw_while) ||
-               PrevPrev->endsSequence(tok::kw_constexpr, tok::kw_if))));
-        const bool ReturnParens =
-            Style.RemoveParentheses == FormatStyle::RPS_ReturnStatement &&
-            ((NestedLambdas.empty() && !IsDecltypeAutoFunction) ||
-             (!NestedLambdas.empty() && !NestedLambdas.back())) &&
-            Prev && Prev->isOneOf(tok::kw_return, tok::kw_co_return) && Next &&
-            Next->is(tok::semi);
-        if ((DoubleParens && !Excluded) || (CommaSeparated && !SeenComma) ||
-            ReturnParens) {
-          LeftParen->Optional = true;
-          FormatTok->Optional = true;
-        }
-      }
+      auto *Prev = LParen->Previous;
+      auto *RParen = FormatTok;
+      nextToken();
       if (Prev) {
+        auto OptionalParens = [&] {
+          if (MightBeStmtExpr || MightBeFoldExpr || Line->InMacroBody ||
+              SeenComma || Style.RemoveParentheses == FormatStyle::RPS_Leave) {
+            return false;
+          }
+          const bool DoubleParens =
+              Prev->is(tok::l_paren) && FormatTok->is(tok::r_paren);
+          if (DoubleParens) {
+            const auto *PrevPrev = Prev->getPreviousNonComment();
+            const bool Excluded =
+                PrevPrev &&
+                (PrevPrev->isOneOf(tok::kw___attribute, tok::kw_decltype) ||
+                 (SeenEqual &&
+                  (PrevPrev->isOneOf(tok::kw_if, tok::kw_while) ||
+                   PrevPrev->endsSequence(tok::kw_constexpr, tok::kw_if))));
+            if (!Excluded)
+              return true;
+          } else {
+            const bool CommaSeparated =
+                Prev->isOneOf(tok::l_paren, tok::comma) &&
+                FormatTok->isOneOf(tok::comma, tok::r_paren);
+            if (CommaSeparated &&
+                // LParen is not preceded by ellipsis, comma.
+                !Prev->endsSequence(tok::comma, tok::ellipsis) &&
+                // RParen is not followed by comma, ellipsis.
+                !(FormatTok->is(tok::comma) &&
+                  Tokens->peekNextToken()->is(tok::ellipsis))) {
+              return true;
+            }
+            const bool ReturnParens =
+                Style.RemoveParentheses == FormatStyle::RPS_ReturnStatement &&
+                ((NestedLambdas.empty() && !IsDecltypeAutoFunction) ||
+                 (!NestedLambdas.empty() && !NestedLambdas.back())) &&
+                Prev->isOneOf(tok::kw_return, tok::kw_co_return) &&
+                FormatTok->is(tok::semi);
+            if (ReturnParens)
+              return true;
+          }
+          return false;
+        };
         if (Prev->is(TT_TypenameMacro)) {
-          LeftParen->setFinalizedType(TT_TypeDeclarationParen);
-          FormatTok->setFinalizedType(TT_TypeDeclarationParen);
-        } else if (Prev->is(tok::greater) && FormatTok->Previous == LeftParen) {
+          LParen->setFinalizedType(TT_TypeDeclarationParen);
+          RParen->setFinalizedType(TT_TypeDeclarationParen);
+        } else if (Prev->is(tok::greater) && RParen->Previous == LParen) {
           Prev->setFinalizedType(TT_TemplateCloser);
+        } else if (OptionalParens()) {
+          LParen->Optional = true;
+          RParen->Optional = true;
         }
       }
-      nextToken();
       return SeenEqual;
     }
     case tok::r_brace:
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index d1e96e0fa544a..6508ca2e7174f 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -27881,6 +27881,10 @@ TEST_F(FormatTest, RemoveParentheses) {
   verifyFormat("foo((a, b));", "foo(((a), b));", Style);
   verifyFormat("foo((a, b));", "foo((a, (b)));", Style);
   verifyFormat("foo((a, b, c));", "foo((a, ((b)), c));", Style);
+  verifyFormat("(..., (hash_a = hash_combine(hash_a, hash_b)));",
+               "(..., ((hash_a = hash_combine(hash_a, hash_b))));", Style);
+  verifyFormat("((hash_a = hash_combine(hash_a, hash_b)), ...);",
+               "(((hash_a = hash_combine(hash_a, hash_b))), ...);", Style);
   verifyFormat("return (0);", "return (((0)));", Style);
   verifyFormat("return (({ 0; }));", "return ((({ 0; })));", Style);
   verifyFormat("return ((... && std::is_convertible_v<TArgsLocal, TArgs>));",

From 0fda7e6332554b7a9d3437b6bff6c6d0c30cb760 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Tue, 11 Mar 2025 21:20:27 +0800
Subject: [PATCH 226/282] [X86][AVX10.2] Fix unexpected larger scope (#130767)

https://godbolt.org/z/oM6bcqEnr
(cherry picked from commit fcce3084cb43a12f2e6e19b8e5b655f3df8739d6)
---
 llvm/lib/Target/X86/X86InstrSSE.td                |  4 ++--
 llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 6aadb788c851e..2a7ab1e310618 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6121,8 +6121,9 @@ let Predicates = [HasAVX, NoAVX10_2] in {
                                     v8i16, VR128, load, i128mem, 0,
                                     SchedWriteMPSAD.XMM>, VEX, VVVV, WIG;
   }
+}
 
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
+let Predicates = [HasAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                    VR128, load, f128mem, 0,
@@ -6136,7 +6137,6 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
                                     VR256, load, i256mem, 0,
                                     SchedWriteDPPS.YMM>, VEX, VVVV, VEX_L, WIG;
 }
-}
 
 let Predicates = [HasAVX2, NoAVX10_2] in {
   let isCommutable = 0 in {
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index 07e86cb01e133..b2e7caa15944c 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -422,3 +422,14 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8>
 }
 
 declare <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8>, <64 x i8>, i8)
+
+; Regression test
+
+define <8 x float> @avx_dp_ps(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: avx_dp_ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdpps $255, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x40,0xc1,0xff]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %r = tail call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a, <8 x float> %b, i8 -1)
+  ret <8 x float> %r
+}

From 0e96713a3b293267014b665ec76589c9668c8a9f Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 9 Mar 2025 21:10:35 +0800
Subject: [PATCH 227/282] [ValueTracking] Bail out on x86_fp80 when computing
 fpclass with knownbits (#130477)

In https://github.com/llvm/llvm-project/pull/97762, we assume the
minimum possible value of X is NaN implies X is NaN. But it doesn't hold
for x86_fp80 format. If the knownbits of X are
`?'011111111111110'????????????????????????????????????????????????????????????????`,
the minimum possible value of X is NaN/unnormal. However, it can be a
normal value.

Closes https://github.com/llvm/llvm-project/issues/130408.

(cherry picked from commit 029e10289a02b438f1a22f401c94ed60ab4bb704)
---
 llvm/lib/Analysis/ValueTracking.cpp       |  3 ++-
 llvm/test/Transforms/InstSimplify/fcmp.ll | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 8a674914641a8..4e79f41df2eb9 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6135,13 +6135,14 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
     else if (Bits.isNegative())
       Known.signBitMustBeOne();
 
-    if (Ty->isIEEE()) {
+    if (Ty->isIEEELikeFPTy()) {
       // IEEE floats are NaN when all bits of the exponent plus at least one of
       // the fraction bits are 1. This means:
       //   - If we assume unknown bits are 0 and the value is NaN, it will
       //     always be NaN
       //   - If we assume unknown bits are 1 and the value is not NaN, it can
       //     never be NaN
+      // Note: They do not hold for x86_fp80 format.
       if (APFloat(Ty->getFltSemantics(), Bits.One).isNaN())
         Known.KnownFPClasses = fcNan;
       else if (!APFloat(Ty->getFltSemantics(), ~Bits.Zero).isNaN())
diff --git a/llvm/test/Transforms/InstSimplify/fcmp.ll b/llvm/test/Transforms/InstSimplify/fcmp.ll
index 64132f5fb7db7..0c2be5210a741 100644
--- a/llvm/test/Transforms/InstSimplify/fcmp.ll
+++ b/llvm/test/Transforms/InstSimplify/fcmp.ll
@@ -16,3 +16,20 @@ define i1 @poison2(float %x) {
   %v = fcmp ueq float %x, poison
   ret i1 %v
 }
+
+define i1 @pr130408(x86_fp80 %x) {
+; CHECK-LABEL: @pr130408(
+; CHECK-NEXT:    [[BITS:%.*]] = bitcast x86_fp80 [[X:%.*]] to i80
+; CHECK-NEXT:    [[MASKED:%.*]] = and i80 [[BITS]], -604444463063240877801473
+; CHECK-NEXT:    [[OR:%.*]] = or i80 [[MASKED]], 302194561415509874573312
+; CHECK-NEXT:    [[FP:%.*]] = bitcast i80 [[OR]] to x86_fp80
+; CHECK-NEXT:    [[RES:%.*]] = fcmp uno x86_fp80 [[FP]], 0xK00000000000000000000
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %bits = bitcast x86_fp80 %x to i80
+  %masked = and i80 %bits, -604444463063240877801473
+  %or = or i80 %masked, 302194561415509874573312
+  %fp = bitcast i80 %or to x86_fp80
+  %res = fcmp uno x86_fp80 %fp, 0xK00000000000000000000
+  ret i1 %res
+}

From 05be3ca72e392ba5055d2c3e617adaeab89e258b Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 5 Mar 2025 11:23:33 +0000
Subject: [PATCH 228/282] [AArch64] Add BE test coverage for popcount. NFC

For #129843

(cherry picked from commit b673a59c9ae5583aa08a8d34a48f9409b660d826)
---
 llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 161 ++++++++++++++++++++++
 llvm/test/CodeGen/AArch64/popcount.ll     | 104 ++++++++++++++
 2 files changed, 265 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index ad0904ff98080..369667ec33f66 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
 ; RUN: llc < %s -mtriple=aarch64 -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
+; RUN: llc < %s -mtriple=aarch64_be-none-eabi | FileCheck %s --check-prefix=CHECK-BE
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-LABEL: cnt32_advsimd:
@@ -32,6 +33,14 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt w0, w0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt32_advsimd:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov s0, w0
+; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
+; CHECK-BE-NEXT:    addv b0, v0.8b
+; CHECK-BE-NEXT:    fmov w0, s0
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -69,6 +78,16 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
 ; CHECK-CSSC-NEXT:    fmov w8, s0
 ; CHECK-CSSC-NEXT:    cnt w0, w8
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt32_advsimd_2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    fmov w8, s0
+; CHECK-BE-NEXT:    fmov s0, w8
+; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
+; CHECK-BE-NEXT:    addv b0, v0.8b
+; CHECK-BE-NEXT:    fmov w0, s0
+; CHECK-BE-NEXT:    ret
   %1 = extractelement <2 x i32> %x, i64 0
   %2 = tail call i32 @llvm.ctpop.i32(i32 %1)
   ret i32 %2
@@ -103,6 +122,16 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt x0, x0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt64_advsimd:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
+; CHECK-BE-NEXT:    addv b0, v0.8b
+; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -147,6 +176,22 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt w0, w0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt32:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    lsr w9, w0, #1
+; CHECK-BE-NEXT:    mov w8, #16843009 // =0x1010101
+; CHECK-BE-NEXT:    and w9, w9, #0x55555555
+; CHECK-BE-NEXT:    sub w9, w0, w9
+; CHECK-BE-NEXT:    lsr w10, w9, #2
+; CHECK-BE-NEXT:    and w9, w9, #0x33333333
+; CHECK-BE-NEXT:    and w10, w10, #0x33333333
+; CHECK-BE-NEXT:    add w9, w9, w10
+; CHECK-BE-NEXT:    add w9, w9, w9, lsr #4
+; CHECK-BE-NEXT:    and w9, w9, #0xf0f0f0f
+; CHECK-BE-NEXT:    mul w8, w9, w8
+; CHECK-BE-NEXT:    lsr w0, w8, #24
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -188,6 +233,22 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt x0, x0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt64:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    lsr x9, x0, #1
+; CHECK-BE-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
+; CHECK-BE-NEXT:    and x9, x9, #0x5555555555555555
+; CHECK-BE-NEXT:    sub x9, x0, x9
+; CHECK-BE-NEXT:    lsr x10, x9, #2
+; CHECK-BE-NEXT:    and x9, x9, #0x3333333333333333
+; CHECK-BE-NEXT:    and x10, x10, #0x3333333333333333
+; CHECK-BE-NEXT:    add x9, x9, x10
+; CHECK-BE-NEXT:    add x9, x9, x9, lsr #4
+; CHECK-BE-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
+; CHECK-BE-NEXT:    mul x8, x9, x8
+; CHECK-BE-NEXT:    lsr x0, x8, #56
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -215,6 +276,14 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
 ; CHECK-CSSC-NEXT:    cmp x8, #1
 ; CHECK-CSSC-NEXT:    cset w0, eq
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop_eq_one:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    sub x8, x0, #1
+; CHECK-BE-NEXT:    eor x9, x0, x8
+; CHECK-BE-NEXT:    cmp x9, x8
+; CHECK-BE-NEXT:    cset w0, hi
+; CHECK-BE-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp eq i64 %count, 1
   %conv = zext i1 %cmp to i32
@@ -244,6 +313,14 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 ; CHECK-CSSC-NEXT:    cmp x8, #1
 ; CHECK-CSSC-NEXT:    cset w0, ne
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop_ne_one:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    sub x8, x0, #1
+; CHECK-BE-NEXT:    eor x9, x0, x8
+; CHECK-BE-NEXT:    cmp x9, x8
+; CHECK-BE-NEXT:    cset w0, ls
+; CHECK-BE-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp ne i64 %count, 1
   %conv = zext i1 %cmp to i32
@@ -273,6 +350,14 @@ define i1 @ctpop32_ne_one(i32 %x) nounwind readnone {
 ; CHECK-CSSC-NEXT:    cmp w8, #1
 ; CHECK-CSSC-NEXT:    cset w0, ne
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop32_ne_one:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    sub w8, w0, #1
+; CHECK-BE-NEXT:    eor w9, w0, w8
+; CHECK-BE-NEXT:    cmp w9, w8
+; CHECK-BE-NEXT:    cset w0, ls
+; CHECK-BE-NEXT:    ret
   %count = tail call i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp ne i32 %count, 1
   ret i1 %cmp
@@ -299,6 +384,13 @@ define i1 @ctpop32_eq_one_nonzero(i32 %x) {
 ; CHECK-CSSC-NEXT:    tst w0, w8
 ; CHECK-CSSC-NEXT:    cset w0, eq
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop32_eq_one_nonzero:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    sub w8, w0, #1
+; CHECK-BE-NEXT:    tst w0, w8
+; CHECK-BE-NEXT:    cset w0, eq
+; CHECK-BE-NEXT:    ret
 entry:
   %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp eq i32 %popcnt, 1
@@ -326,11 +418,80 @@ define i1 @ctpop32_ne_one_nonzero(i32 %x) {
 ; CHECK-CSSC-NEXT:    tst w0, w8
 ; CHECK-CSSC-NEXT:    cset w0, ne
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop32_ne_one_nonzero:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    sub w8, w0, #1
+; CHECK-BE-NEXT:    tst w0, w8
+; CHECK-BE-NEXT:    cset w0, ne
+; CHECK-BE-NEXT:    ret
 entry:
   %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp ne i32 %popcnt, 1
   ret i1 %cmp
 }
 
+define i128 @cnt128(i128 %x) nounwind readnone {
+; CHECK-LABEL: cnt128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov.d v0[1], x1
+; CHECK-NEXT:    cnt.16b v0, v0
+; CHECK-NEXT:    addv.16b b0, v0
+; CHECK-NEXT:    mov.d x1, v0[1]
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; CHECK-NONEON-LABEL: cnt128:
+; CHECK-NONEON:       // %bb.0:
+; CHECK-NONEON-NEXT:    lsr x9, x0, #1
+; CHECK-NONEON-NEXT:    lsr x10, x1, #1
+; CHECK-NONEON-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
+; CHECK-NONEON-NEXT:    and x9, x9, #0x5555555555555555
+; CHECK-NONEON-NEXT:    and x10, x10, #0x5555555555555555
+; CHECK-NONEON-NEXT:    sub x9, x0, x9
+; CHECK-NONEON-NEXT:    sub x10, x1, x10
+; CHECK-NONEON-NEXT:    mov x1, xzr
+; CHECK-NONEON-NEXT:    lsr x11, x9, #2
+; CHECK-NONEON-NEXT:    lsr x12, x10, #2
+; CHECK-NONEON-NEXT:    and x9, x9, #0x3333333333333333
+; CHECK-NONEON-NEXT:    and x10, x10, #0x3333333333333333
+; CHECK-NONEON-NEXT:    and x11, x11, #0x3333333333333333
+; CHECK-NONEON-NEXT:    add x9, x9, x11
+; CHECK-NONEON-NEXT:    and x11, x12, #0x3333333333333333
+; CHECK-NONEON-NEXT:    add x9, x9, x9, lsr #4
+; CHECK-NONEON-NEXT:    add x10, x10, x11
+; CHECK-NONEON-NEXT:    add x10, x10, x10, lsr #4
+; CHECK-NONEON-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON-NEXT:    mul x9, x9, x8
+; CHECK-NONEON-NEXT:    and x10, x10, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON-NEXT:    mul x8, x10, x8
+; CHECK-NONEON-NEXT:    lsr x9, x9, #56
+; CHECK-NONEON-NEXT:    add x0, x9, x8, lsr #56
+; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x1
+; CHECK-CSSC-NEXT:    cnt x9, x0
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    add x0, x9, x8
+; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt128:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    mov v0.d[1], x1
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    cnt v0.16b, v0.16b
+; CHECK-BE-NEXT:    addv b0, v0.16b
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    mov x1, v0.d[1]
+; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    ret
+  %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
+  ret i128 %cnt
+}
+
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 89b1ac0a0edf1..6cc925f0ae91f 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefixes=CHECK,DOT
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown | FileCheck %s --check-prefixes=BE
 ; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISEL
 ; RUN: llc < %s -O0 -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISELO0
 ; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=GISEL,NEON-GISEL
@@ -32,6 +33,18 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount128:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    ldr d0, [x0]
+; BE-NEXT:    add x8, x0, #8
+; BE-NEXT:    ld1 { v0.d }[1], [x8]
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    addv b0, v0.16b
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    mov w0, v0.s[3]
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount128:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    ldr q0, [x0]
@@ -111,6 +124,27 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    add w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount256:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    ldr d0, [x0]
+; BE-NEXT:    ldr d1, [x0, #16]
+; BE-NEXT:    add x8, x0, #24
+; BE-NEXT:    add x9, x0, #8
+; BE-NEXT:    ld1 { v0.d }[1], [x9]
+; BE-NEXT:    ld1 { v1.d }[1], [x8]
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    rev64 v1.16b, v1.16b
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    cnt v1.16b, v1.16b
+; BE-NEXT:    addv b0, v0.16b
+; BE-NEXT:    addv b1, v1.16b
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    rev32 v1.16b, v1.16b
+; BE-NEXT:    mov w8, v0.s[3]
+; BE-NEXT:    mov w9, v1.s[3]
+; BE-NEXT:    add w0, w9, w8
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount256:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    ldp x8, x9, [x0]
@@ -199,6 +233,18 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount1x128:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    mov v0.d[1], x1
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    addv b0, v0.16b
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    mov x1, v0.d[1]
+; BE-NEXT:    fmov x0, d0
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount1x128:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    mov v0.d[0], x0
@@ -266,6 +312,17 @@ define <2 x i64> @popcount2x64(<2 x i64> %0) {
 ; SVE-NEXT:    uaddlp v0.2d, v0.4s
 ; SVE-NEXT:    ret
 ;
+; BE-LABEL: popcount2x64:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    uaddlp v0.8h, v0.16b
+; BE-NEXT:    uaddlp v0.4s, v0.8h
+; BE-NEXT:    uaddlp v0.2d, v0.4s
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+;
 ; GISELO0-LABEL: popcount2x64:
 ; GISELO0:       // %bb.0: // %Entry
 ; GISELO0-NEXT:    cnt v0.16b, v0.16b
@@ -326,6 +383,15 @@ define <1 x i64> @popcount1x64(<1 x i64> %0) {
 ; CHECK-NEXT:    uaddlp v0.1d, v0.2s
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount1x64:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.8b, v0.8b
+; BE-NEXT:    cnt v0.8b, v0.8b
+; BE-NEXT:    uaddlp v0.4h, v0.8b
+; BE-NEXT:    uaddlp v0.2s, v0.4h
+; BE-NEXT:    uaddlp v0.1d, v0.2s
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount1x64:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    cnt v0.8b, v0.8b
@@ -382,6 +448,17 @@ define <4 x i32> @popcount4x32(<4 x i32> %0) {
 ; SVE-NEXT:    uaddlp v0.4s, v0.8h
 ; SVE-NEXT:    ret
 ;
+; BE-LABEL: popcount4x32:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    uaddlp v0.8h, v0.16b
+; BE-NEXT:    uaddlp v0.4s, v0.8h
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+;
 ; GISELO0-LABEL: popcount4x32:
 ; GISELO0:       // %bb.0: // %Entry
 ; GISELO0-NEXT:    cnt v0.16b, v0.16b
@@ -449,6 +526,15 @@ define <2 x i32> @popcount2x32(<2 x i32> %0) {
 ; SVE-NEXT:    uaddlp v0.2s, v0.4h
 ; SVE-NEXT:    ret
 ;
+; BE-LABEL: popcount2x32:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.8b, v0.8b
+; BE-NEXT:    cnt v0.8b, v0.8b
+; BE-NEXT:    uaddlp v0.4h, v0.8b
+; BE-NEXT:    uaddlp v0.2s, v0.4h
+; BE-NEXT:    rev64 v0.2s, v0.2s
+; BE-NEXT:    ret
+;
 ; GISELO0-LABEL: popcount2x32:
 ; GISELO0:       // %bb.0: // %Entry
 ; GISELO0-NEXT:    cnt v0.8b, v0.8b
@@ -498,6 +584,16 @@ define <8 x i16> @popcount8x16(<8 x i16> %0) {
 ; CHECK-NEXT:    uaddlp v0.8h, v0.16b
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount8x16:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    uaddlp v0.8h, v0.16b
+; BE-NEXT:    rev64 v0.8h, v0.8h
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount8x16:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    cnt v0.16b, v0.16b
@@ -529,6 +625,14 @@ define <4 x i16> @popcount4x16(<4 x i16> %0) {
 ; CHECK-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount4x16:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.8b, v0.8b
+; BE-NEXT:    cnt v0.8b, v0.8b
+; BE-NEXT:    uaddlp v0.4h, v0.8b
+; BE-NEXT:    rev64 v0.4h, v0.4h
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount4x16:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    cnt v0.8b, v0.8b

From 32ce5b043c2b6e628e85d89df5461238632d9211 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 5 Mar 2025 20:08:34 +0000
Subject: [PATCH 229/282] [AArch64] Fix BE popcount casts. (#129879)

A bitcast, being defined as a load and a store, can change the lane
order. We need to use a NVCAST instead to keep the lanes out of the
VADDV the same in big-endian. The extracting from a v2i64 vector is
to keep the types of the nvcast legal, but also allow us to replace a
lane mov with a mov 0.

Fixes #129843

(cherry picked from commit ab811e75734a77247dae6df1579fa6f29394f200)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 10 ++++++++--
 llvm/test/CodeGen/AArch64/arm64-popcnt.ll     |  8 +++-----
 llvm/test/CodeGen/AArch64/parity.ll           |  2 +-
 llvm/test/CodeGen/AArch64/popcount.ll         | 19 +++++++++----------
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a2824d6a8f5af..16fcd589cecd1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10785,7 +10785,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
     if (VT == MVT::i32)
       AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
                          DAG.getConstant(0, DL, MVT::i64));
-    AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+    else
+      AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                         DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
+                         DAG.getConstant(0, DL, MVT::i64));
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
@@ -10794,7 +10797,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
 
     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
     SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
-    AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+    AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+                       DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
+                       DAG.getConstant(0, DL, MVT::i64));
+    AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 369667ec33f66..d06e42f5405ef 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -129,7 +129,6 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-BE-NEXT:    addv b0, v0.8b
-; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    fmov x0, d0
 ; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
@@ -436,9 +435,9 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov.d v0[1], x1
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    cnt.16b v0, v0
 ; CHECK-NEXT:    addv.16b b0, v0
-; CHECK-NEXT:    mov.d x1, v0[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
@@ -481,13 +480,12 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; CHECK-BE-LABEL: cnt128:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    mov x0, xzr
 ; CHECK-BE-NEXT:    mov v0.d[1], x1
 ; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
 ; CHECK-BE-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-BE-NEXT:    addv b0, v0.16b
-; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-BE-NEXT:    mov x1, v0.d[1]
-; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    fmov x1, d0
 ; CHECK-BE-NEXT:    ret
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index 1e51793fb5f91..91515277cb3f6 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -159,7 +159,7 @@ define i32 @parity_64_trunc(i64 %x) {
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 6cc925f0ae91f..e664e73594923 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -41,8 +41,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; BE-NEXT:    rev64 v0.16b, v0.16b
 ; BE-NEXT:    cnt v0.16b, v0.16b
 ; BE-NEXT:    addv b0, v0.16b
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    mov w0, v0.s[3]
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    mov w0, v0.s[1]
 ; BE-NEXT:    ret
 ;
 ; GISEL-LABEL: popcount128:
@@ -138,10 +138,10 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; BE-NEXT:    cnt v1.16b, v1.16b
 ; BE-NEXT:    addv b0, v0.16b
 ; BE-NEXT:    addv b1, v1.16b
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    rev32 v1.16b, v1.16b
-; BE-NEXT:    mov w8, v0.s[3]
-; BE-NEXT:    mov w9, v1.s[3]
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    rev64 v1.4s, v1.4s
+; BE-NEXT:    mov w8, v0.s[1]
+; BE-NEXT:    mov w9, v1.s[1]
 ; BE-NEXT:    add w0, w9, w8
 ; BE-NEXT:    ret
 ;
@@ -227,22 +227,21 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK:       // %bb.0: // %Entry
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov v0.d[1], x1
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    mov x1, v0.d[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: popcount1x128:
 ; BE:       // %bb.0: // %Entry
 ; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    mov x0, xzr
 ; BE-NEXT:    mov v0.d[1], x1
 ; BE-NEXT:    rev64 v0.16b, v0.16b
 ; BE-NEXT:    cnt v0.16b, v0.16b
 ; BE-NEXT:    addv b0, v0.16b
-; BE-NEXT:    rev64 v0.16b, v0.16b
-; BE-NEXT:    mov x1, v0.d[1]
-; BE-NEXT:    fmov x0, d0
+; BE-NEXT:    fmov x1, d0
 ; BE-NEXT:    ret
 ;
 ; GISEL-LABEL: popcount1x128:

From 9010db1b84ef81d23266b5c57569beadd7967ddb Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Wed, 5 Mar 2025 14:01:24 +0800
Subject: [PATCH 230/282] [Clang] Treat constexpr-unknown value as invalid in
 `EvaluateAsInitializer` (#128409)

It is an alternative to
https://github.com/llvm/llvm-project/pull/127525.
Close https://github.com/llvm/llvm-project/issues/127475.
---
 clang/lib/AST/ExprConstant.cpp          | 14 ++++++++++++--
 clang/lib/CodeGen/CGExprConstant.cpp    |  5 ++++-
 clang/test/CodeGenCXX/cxx23-p2280r4.cpp | 15 +++++++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGenCXX/cxx23-p2280r4.cpp

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 0e41e3dbc8a32..a249c3e7d08b7 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -3628,8 +3628,6 @@ static bool evaluateVarDeclInit(EvalInfo &Info, const Expr *E,
   if (AllowConstexprUnknown) {
     if (!Result)
       Result = &Info.CurrentCall->createConstexprUnknownAPValues(VD, Base);
-    else
-      Result->setConstexprUnknown();
   }
   return true;
 }
@@ -17000,6 +16998,18 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx,
 
     if (!Info.discardCleanups())
       llvm_unreachable("Unhandled cleanup; missing full expression marker?");
+
+    if (Value.allowConstexprUnknown()) {
+      assert(Value.isLValue() && "Expected an lvalue");
+      auto Base = Value.getLValueBase();
+      const auto *NewVD = Base.dyn_cast<const ValueDecl *>();
+      if (!NewVD)
+        NewVD = VD;
+      Info.FFDiag(getExprLoc(), diag::note_constexpr_var_init_non_constant, 1)
+          << NewVD;
+      NoteLValueLocation(Info, Base);
+      return false;
+    }
   }
 
   return CheckConstantExpression(Info, DeclLoc, DeclTy, Value,
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 655fc3dc954c8..9abbe4b801d56 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -1881,8 +1881,11 @@ llvm::Constant *ConstantEmitter::tryEmitPrivateForVarInit(const VarDecl &D) {
 
   // Try to emit the initializer.  Note that this can allow some things that
   // are not allowed by tryEmitPrivateForMemory alone.
-  if (APValue *value = D.evaluateValue())
+  if (APValue *value = D.evaluateValue()) {
+    assert(!value->allowConstexprUnknown() &&
+           "Constexpr unknown values are not allowed in CodeGen");
     return tryEmitPrivateForMemory(*value, destType);
+  }
 
   return nullptr;
 }
diff --git a/clang/test/CodeGenCXX/cxx23-p2280r4.cpp b/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
new file mode 100644
index 0000000000000..d5409be451df0
--- /dev/null
+++ b/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++23 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++17 %s -emit-llvm -o - | FileCheck %s
+
+extern int& s;
+
+// CHECK: @_Z4testv()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[I:%.*]] = alloca ptr, align {{.*}}
+// CHECK-NEXT: [[X:%.*]] = load ptr, ptr @s, align {{.*}}
+// CHECK-NEXT: store ptr [[X]], ptr [[I]], align {{.*}}
+int& test() {
+  auto &i = s;
+  return i;
+}

From fbb2a7e74d918752aed9442ac8883ace8b636c50 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Sun, 9 Mar 2025 18:38:55 -0700
Subject: [PATCH 231/282] [clang] Reject constexpr-unknown values as constant
 expressions more consistently (#129952)

Perform the check for constexpr-unknown values in the same place we
perform checks for other values which don't count as constant
expressions.

While I'm here, also fix a rejects-valid with a reference that doesn't
have an initializer. This diagnostic was also covering up some of the
bugs here.

The existing behavior with -fexperimental-new-constant-interpreter seems
to be correct, but the diagnostics are slightly different; it would be
helpful if someone could check on that as a followup.

Followup to #128409.

Fixes #129844. Fixes #129845.
---
 clang/lib/AST/ExprConstant.cpp                | 25 +++++++--------
 clang/test/CodeGenCXX/cxx23-p2280r4.cpp       | 15 ++++++++-
 .../SemaCXX/constant-expression-cxx11.cpp     |  4 +--
 .../SemaCXX/constant-expression-p2280r4.cpp   | 32 +++++++++++++++++--
 4 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a249c3e7d08b7..5aae78dd2fee7 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2419,6 +2419,16 @@ static bool CheckLValueConstantExpression(EvalInfo &Info, SourceLocation Loc,
           LVal.getLValueCallIndex() == 0) &&
          "have call index for global lvalue");
 
+  if (LVal.allowConstexprUnknown()) {
+    if (BaseVD) {
+      Info.FFDiag(Loc, diag::note_constexpr_var_init_non_constant, 1) << BaseVD;
+      NoteLValueLocation(Info, Base);
+    } else {
+      Info.FFDiag(Loc);
+    }
+    return false;
+  }
+
   if (Base.is<DynamicAllocLValue>()) {
     Info.FFDiag(Loc, diag::note_constexpr_dynamic_alloc)
         << IsReferenceType << !Designator.Entries.empty();
@@ -3597,7 +3607,8 @@ static bool evaluateVarDeclInit(EvalInfo &Info, const Expr *E,
   // expressions here; doing so would regress diagnostics for things like
   // reading from a volatile constexpr variable.
   if ((Info.getLangOpts().CPlusPlus && !VD->hasConstantInitialization() &&
-       VD->mightBeUsableInConstantExpressions(Info.Ctx)) ||
+       VD->mightBeUsableInConstantExpressions(Info.Ctx) &&
+       !AllowConstexprUnknown) ||
       ((Info.getLangOpts().CPlusPlus || Info.getLangOpts().OpenCL) &&
        !Info.getLangOpts().CPlusPlus11 && !VD->hasICEInitializer(Info.Ctx))) {
     if (Init) {
@@ -16998,18 +17009,6 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx,
 
     if (!Info.discardCleanups())
       llvm_unreachable("Unhandled cleanup; missing full expression marker?");
-
-    if (Value.allowConstexprUnknown()) {
-      assert(Value.isLValue() && "Expected an lvalue");
-      auto Base = Value.getLValueBase();
-      const auto *NewVD = Base.dyn_cast<const ValueDecl *>();
-      if (!NewVD)
-        NewVD = VD;
-      Info.FFDiag(getExprLoc(), diag::note_constexpr_var_init_non_constant, 1)
-          << NewVD;
-      NoteLValueLocation(Info, Base);
-      return false;
-    }
   }
 
   return CheckConstantExpression(Info, DeclLoc, DeclTy, Value,
diff --git a/clang/test/CodeGenCXX/cxx23-p2280r4.cpp b/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
index d5409be451df0..b62c68c66f0fa 100644
--- a/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
+++ b/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
@@ -4,7 +4,7 @@
 
 extern int& s;
 
-// CHECK: @_Z4testv()
+// CHECK-LABEL: @_Z4testv()
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[I:%.*]] = alloca ptr, align {{.*}}
 // CHECK-NEXT: [[X:%.*]] = load ptr, ptr @s, align {{.*}}
@@ -13,3 +13,16 @@ int& test() {
   auto &i = s;
   return i;
 }
+
+// CHECK-LABEL: @_Z1fv(
+// CHECK: [[X1:%.*]] = load ptr, ptr @x, align 8
+// CHECK-NEXT: store ptr [[X1]]
+// CHECK: [[X2:%.*]] = load ptr, ptr @x, align 8
+// CHECK-NEXT: store ptr [[X2]]
+// CHECK: [[X3:%.*]] = load ptr, ptr @x, align 8
+// CHECK-NEXT: store ptr [[X3]]
+int &ff();
+int &x = ff();
+struct A { int& x; };
+struct B { A x[20]; };
+B f() { return {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; }
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index 76e2f81947051..c35f3a5632a05 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -1472,8 +1472,8 @@ namespace ConvertedConstantExpr {
   enum class E {
     em = m,
     en = n, // expected-error {{enumerator value is not a constant expression}} cxx11_20-note {{initializer of 'n' is unknown}}
-    eo = (m + // pre-cxx23-error {{not a constant expression}}
-          n // cxx11_20-note {{initializer of 'n' is unknown}} cxx23-error {{not a constant expression}}
+    eo = (m + // expected-error {{not a constant expression}}
+          n // cxx11_20-note {{initializer of 'n' is unknown}}
           ),
     eq = reinterpret_cast<long>((int*)0) // expected-error {{not a constant expression}} expected-note {{reinterpret_cast}}
   };
diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp
index 8648350b397e0..6c9a87267109c 100644
--- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp
+++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 -std=c++23 -verify %s
+// RUN: %clang_cc1 -std=c++23 -verify=expected,nointerpreter %s
+// (Run line removed for backport to 20.x, so we don't need to backport
+// fexperimental-new-constant-interpreter changes)
+// UN: %clang_cc1 -std=c++23 -verify %s -fexperimental-new-constant-interpreter
 
 using size_t = decltype(sizeof(0));
 
@@ -38,8 +41,8 @@ void splash(Swim& swam) {
   static_assert(swam.phelps() == 28);     // ok
   static_assert((&swam)->phelps() == 28); // ok
   Swim* pswam = &swam;                    // expected-note {{declared here}}
-  static_assert(pswam->phelps() == 28);   // expected-error {{static assertion expression is not an integral constant expression}}
-                                          // expected-note@-1 {{read of non-constexpr variable 'pswam' is not allowed in a constant expression}}
+  static_assert(pswam->phelps() == 28);   // expected-error {{static assertion expression is not an integral constant expression}} \
+                                          // expected-note {{read of non-constexpr variable 'pswam' is not allowed in a constant expression}}
   static_assert(how_many(swam) == 28);    // ok
   static_assert(Swim().lochte() == 12);   // ok
   static_assert(swam.lochte() == 12);     // expected-error {{static assertion expression is not an integral constant expression}}
@@ -153,3 +156,26 @@ int g() {
     static_assert(f(arr) == 5);
 }
 }
+
+namespace GH128409 {
+  int &ff();
+  int &x = ff(); // nointerpreter-note {{declared here}}
+  constinit int &z = x; // expected-error {{variable does not have a constant initializer}} \
+                        // expected-note {{required by 'constinit' specifier here}} \
+                        // nointerpreter-note {{initializer of 'x' is not a constant expression}}
+}
+
+namespace GH129845 {
+  int &ff();
+  int &x = ff(); // nointerpreter-note {{declared here}}
+  struct A { int& x; };
+  constexpr A g = {x}; // expected-error {{constexpr variable 'g' must be initialized by a constant expression}} \
+                       // nointerpreter-note {{initializer of 'x' is not a constant expression}}
+  const A* gg = &g;
+}
+
+namespace extern_reference_used_as_unknown {
+  extern int &x;
+  int y;
+  constinit int& g = (x,y); // expected-warning {{left operand of comma operator has no effect}}
+}

From 72c4a3f419f4ec5397a32859a20b112e225df4ea Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Mon, 10 Mar 2025 13:45:44 +0000
Subject: [PATCH 232/282] [clang][test] Don't require specific alignment in
 test case (#130589)

https://github.com/llvm/llvm-project/pull/129952 /
42d49a77241df73a17cb442973702fc460e7fb90 added this test which is
failing on 32-bit ARM because the alignment chosen is 4 not 8. Which
would make sense if this is a 32/64 bit difference

https://lab.llvm.org/buildbot/#/builders/154/builds/13059
```
<stdin>:34:30: note: scanning from here
define dso_local void @_Z1fv(ptr dead_on_unwind noalias writable sret(%struct.B) align 4 %agg.result) #0 {
                             ^
<stdin>:38:2: note: possible intended match here
 %0 = load ptr, ptr @x, align 4
 ^
```
The other test does not check alignment, so I'm assuming that it is not
important here.
---
 clang/test/CodeGenCXX/cxx23-p2280r4.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/test/CodeGenCXX/cxx23-p2280r4.cpp b/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
index b62c68c66f0fa..53b00695d9d6d 100644
--- a/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
+++ b/clang/test/CodeGenCXX/cxx23-p2280r4.cpp
@@ -15,11 +15,11 @@ int& test() {
 }
 
 // CHECK-LABEL: @_Z1fv(
-// CHECK: [[X1:%.*]] = load ptr, ptr @x, align 8
+// CHECK: [[X1:%.*]] = load ptr, ptr @x, align {{.*}}
 // CHECK-NEXT: store ptr [[X1]]
-// CHECK: [[X2:%.*]] = load ptr, ptr @x, align 8
+// CHECK: [[X2:%.*]] = load ptr, ptr @x, align {{.*}}
 // CHECK-NEXT: store ptr [[X2]]
-// CHECK: [[X3:%.*]] = load ptr, ptr @x, align 8
+// CHECK: [[X3:%.*]] = load ptr, ptr @x, align {{.*}}
 // CHECK-NEXT: store ptr [[X3]]
 int &ff();
 int &x = ff();

From 0412f708c380d8fefc2136bbcfbcb5d551fe5730 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk@dianqk.net>
Date: Mon, 10 Mar 2025 19:34:07 +0800
Subject: [PATCH 233/282] [TailDuplicator] Do not restrict the computed gotos
 (#114990)

Fixes #106846.

This is what I learned from GCC. I found that GCC does not duplicate the
BB that has indirect jumps with the jump table. I believe GCC has
provided a clear explanation here:

> Duplicate the blocks containing computed gotos. This basically
unfactors computed gotos that were factored early on in the compilation
process to speed up edge based data flow. We used to not unfactor them
again, which can seriously pessimize code with many computed jumps in
the source code, such as interpreters.

(cherry picked from commit dd21aacd76e36d4db157a5d7a7b5370d456426e6)
---
 llvm/include/llvm/CodeGen/MachineInstr.h      |  13 +-
 llvm/lib/CodeGen/TailDuplicator.cpp           |  12 +-
 .../CodeGen/X86/tail-dup-computed-goto.mir    | 255 ++++++++++++++++++
 3 files changed, 276 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/tail-dup-computed-goto.mir

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 102b1eb07358e..b26cabe801ee8 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -994,8 +994,17 @@ class MachineInstr
 
   /// Return true if this is an indirect branch, such as a
   /// branch through a register.
-  bool isIndirectBranch(QueryType Type = AnyInBundle) const {
-    return hasProperty(MCID::IndirectBranch, Type);
+  bool isIndirectBranch(QueryType Type = AnyInBundle,
+                        bool IncludeJumpTable = true) const {
+    return hasProperty(MCID::IndirectBranch, Type) &&
+           (IncludeJumpTable || !llvm::any_of(operands(), [](const auto &Op) {
+              return Op.isJTI();
+            }));
+  }
+
+  bool isComputedGoto(QueryType Type = AnyInBundle) const {
+    // Jump tables are not considered computed gotos.
+    return isIndirectBranch(Type, /*IncludeJumpTable=*/false);
   }
 
   /// Return true if this is a branch which may fall
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 6c6d38462484a..21f75458c90f3 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -601,8 +601,11 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // that rearrange the predecessors of the indirect branch.
 
   bool HasIndirectbr = false;
-  if (!TailBB.empty())
+  bool HasComputedGoto = false;
+  if (!TailBB.empty()) {
     HasIndirectbr = TailBB.back().isIndirectBranch();
+    HasComputedGoto = TailBB.back().isComputedGoto();
+  }
 
   if (HasIndirectbr && PreRegAlloc)
     MaxDuplicateCount = TailDupIndirectBranchSize;
@@ -660,7 +663,12 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  if (TailBB.pred_size() > TailDupPredSize &&
+  // NB. This basically unfactors computed gotos that were factored early on in
+  // the compilation process to speed up edge based data flow. If we do not
+  // unfactor them again, it can seriously pessimize code with many computed
+  // jumps in the source code, such as interpreters. Therefore we do not
+  // restrict the computed gotos.
+  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
       TailBB.succ_size() > TailDupSuccSize) {
     // If TailBB or any of its successors contains a phi, we may have to add a
     // large number of additional phis with additional incoming values.
diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
new file mode 100644
index 0000000000000..a472dc67d8d51
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
@@ -0,0 +1,255 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
+# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
+--- |
+  declare i64 @f0()
+  declare i64 @f1()
+  declare i64 @f2()
+  declare i64 @f3()
+  declare i64 @f4()
+  declare i64 @f5()
+  @computed_goto.dispatch = external global [5 x ptr]
+  define void @computed_goto() { ret void }
+  define void @jump_table() { ret void }
+...
+---
+name:            computed_goto
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: computed_goto
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY1]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64_nosp = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gr64_nosp = COPY [[COPY7]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY7]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gr64_nosp = COPY [[COPY9]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gr64_nosp = COPY [[COPY10]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY10]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:gr64_nosp = COPY [[COPY12]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:gr64_nosp = COPY [[COPY13]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY13]], @computed_goto.dispatch, $noreg
+  bb.0:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %6:gr64 = COPY $rax
+    %0:gr64 = COPY %6
+    JMP_1 %bb.5
+
+  bb.1:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %10:gr64 = COPY $rax
+    %1:gr64 = COPY %10
+    JMP_1 %bb.5
+
+  bb.2:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %9:gr64 = COPY $rax
+    %2:gr64 = COPY %9
+    JMP_1 %bb.5
+
+  bb.3:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %8:gr64 = COPY $rax
+    %3:gr64 = COPY %8
+    JMP_1 %bb.5
+
+  bb.4:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %7:gr64 = COPY $rax
+    %4:gr64 = COPY %7
+
+  bb.5:
+    successors: %bb.1, %bb.2, %bb.3, %bb.4
+
+    %5:gr64_nosp = PHI %0, %bb.0, %4, %bb.4, %3, %bb.3, %2, %bb.2, %1, %bb.1
+    JMP64m $noreg, 8, %5, @computed_goto.dispatch, $noreg
+
+...
+---
+name:            jump_table
+tracksRegLiveness: true
+jumpTable:
+  kind:            block-address
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6' ]
+body:             |
+  ; CHECK-LABEL: name: jump_table
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x1999999a), %bb.4(0x1999999a), %bb.5(0x1999999a), %bb.6(0x1999999a), %bb.7(0x1999999a)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gr64 = PHI [[COPY1]], %bb.0, %3, %bb.7, %4, %bb.6, %5, %bb.5, %6, %bb.4, %7, %bb.3
+  ; CHECK-NEXT:   [[DEC64r:%[0-9]+]]:gr64_nosp = DEC64r [[PHI]], implicit-def dead $eflags
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[DEC64r]], %jump-table.0, $noreg :: (load (s64) from jump-table)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gr64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64 = COPY [[COPY6]]
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f5, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gr64 = COPY [[COPY10]]
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  bb.0:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %7:gr64 = COPY $rax
+    %0:gr64 = COPY %7
+
+  bb.1:
+    %1:gr64 = PHI %0, %bb.0, %6, %bb.6, %5, %bb.5, %4, %bb.4, %3, %bb.3, %2, %bb.2
+    %8:gr64_nosp = DEC64r %1, implicit-def dead $eflags
+
+  bb.8:
+    successors: %bb.2(0x1999999a), %bb.3(0x1999999a), %bb.4(0x1999999a), %bb.5(0x1999999a), %bb.6(0x1999999a)
+
+    JMP64m $noreg, 8, %8, %jump-table.0, $noreg :: (load (s64) from jump-table)
+
+  bb.2:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %13:gr64 = COPY $rax
+    %2:gr64 = COPY %13
+    JMP_1 %bb.1
+
+  bb.3:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %12:gr64 = COPY $rax
+    %3:gr64 = COPY %12
+    JMP_1 %bb.1
+
+  bb.4:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %11:gr64 = COPY $rax
+    %4:gr64 = COPY %11
+    JMP_1 %bb.1
+
+  bb.5:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %10:gr64 = COPY $rax
+    %5:gr64 = COPY %10
+    JMP_1 %bb.1
+
+  bb.6:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 target-flags(x86-plt) @f5, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %9:gr64 = COPY $rax
+    %6:gr64 = COPY %9
+    JMP_1 %bb.1
+
+  bb.7:
+
+...

From 50343e517992970f62726d601e53054e588df437 Mon Sep 17 00:00:00 2001
From: aankit-ca <quic_aankit@quicinc.com>
Date: Thu, 6 Mar 2025 15:02:10 -0800
Subject: [PATCH 234/282] [HEXAGON] Fix hvx-isel for extract_subvector op
 (#129672)

Fixes a crash with extract_subvectors in Hexagon backend seen when the
source vector is a vector-pair and result vector is not hvx vector size.

LLVM Issue: https://github.com/llvm/llvm-project/issues/128775
Fixes #128775
---------

Co-authored-by: aankit-quic <aankit@quicinc.com>
(cherry picked from commit 29d3fc3f11d272a72ac255af9277c740f26c3dfc)
---
 .../Target/Hexagon/HexagonISelLoweringHVX.cpp |  10 +-
 .../test/CodeGen/Hexagon/autohvx/fp-to-int.ll | 406 +++++++++---------
 .../test/CodeGen/Hexagon/autohvx/int-to-fp.ll | 120 +++---
 .../CodeGen/Hexagon/isel/extract-subvec.ll    |  34 ++
 4 files changed, 302 insertions(+), 268 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 816e063f8dbbe..1a19e81a68f08 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1265,11 +1265,15 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV,
   // the subvector of interest. The subvector will never overlap two single
   // vectors.
   if (isHvxPairTy(VecTy)) {
-    if (Idx * ElemWidth >= 8*HwLen)
+    unsigned SubIdx = Hexagon::vsub_lo;
+    if (Idx * ElemWidth >= 8 * HwLen) {
+      SubIdx = Hexagon::vsub_hi;
       Idx -= VecTy.getVectorNumElements() / 2;
+    }
 
-    VecV = OrigOp;
-    if (typeSplit(VecTy).first == ResTy)
+    VecTy = typeSplit(VecTy).first;
+    VecV = DAG.getTargetExtractSubreg(SubIdx, dl, VecTy, VecV);
+    if (VecTy == ResTy)
       return VecV;
   }
 
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
index ac51662242de8..196b37678be61 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
@@ -13,13 +13,13 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(##32768,#1)
 ; CHECK-NEXT:     r4 = #14
-; CHECK-NEXT:     v1 = vmem(r0+#0)
+; CHECK-NEXT:     v0 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2.h = vsplat(r3)
 ; CHECK-NEXT:     r6 = #5
 ; CHECK-NEXT:     v3.h = vasl(v0.h,r2)
-; CHECK-NEXT:     v0.cur = vmem(r0+#1)
+; CHECK-NEXT:     v1 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v4.h = vsplat(r4)
@@ -33,55 +33,55 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3 = #16
-; CHECK-NEXT:     v5.h = vasl(v1.h,r6)
-; CHECK-NEXT:     q1 = vcmp.gt(v7.h,v0.h)
+; CHECK-NEXT:     v5.h = vasl(v0.h,r6)
+; CHECK-NEXT:     q1 = vcmp.gt(v7.h,v1.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6.h = vsplat(r3)
-; CHECK-NEXT:     v27.h = vasr(v3.h,r5)
+; CHECK-NEXT:     v28.h = vasr(v3.h,r5)
 ; CHECK-NEXT:     v5 = vor(v5,v2)
-; CHECK-NEXT:     q0 = vcmp.gt(v7.h,v1.h)
+; CHECK-NEXT:     q0 = vcmp.gt(v7.h,v0.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v9.h = vsplat(r4)
 ; CHECK-NEXT:     v8.h = vasr(v8.h,r5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v26.h = vasl(v0.h,r6)
-; CHECK-NEXT:     v0.h = vsub(v4.h,v27.h)
+; CHECK-NEXT:     v27.h = vasl(v1.h,r6)
+; CHECK-NEXT:     v1.h = vsub(v4.h,v28.h)
 ; CHECK-NEXT:     v4.h = vsub(v4.h,v8.h)
-; CHECK-NEXT:     v28 = vmux(q0,v2,v9)
+; CHECK-NEXT:     v29 = vmux(q0,v2,v9)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     v1.h = vmin(v1.h,v6.h)
+; CHECK-NEXT:     v0 = vor(v27,v2)
 ; CHECK-NEXT:     v4.h = vmin(v4.h,v6.h)
-; CHECK-NEXT:     v1 = vor(v26,v2)
-; CHECK-NEXT:     v0.h = vmin(v0.h,v6.h)
 ; CHECK-NEXT:     v2 = vmux(q1,v2,v9)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     q2 = vcmp.gt(v4.h,v7.h)
-; CHECK-NEXT:     q3 = vcmp.gt(v0.h,v7.h)
+; CHECK-NEXT:     q2 = vcmp.gt(v1.h,v7.h)
+; CHECK-NEXT:     q3 = vcmp.gt(v4.h,v7.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v5.h = vlsr(v5.h,v4.h)
+; CHECK-NEXT:     v5.h = vlsr(v5.h,v1.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1.h = vlsr(v1.h,v0.h)
-; CHECK-NEXT:     v29.h = vsub(v7.h,v5.h)
+; CHECK-NEXT:     v0.h = vlsr(v0.h,v4.h)
+; CHECK-NEXT:     v30.h = vsub(v7.h,v5.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v30.h = vsub(v7.h,v1.h)
-; CHECK-NEXT:     v5 = vmux(q0,v29,v5)
+; CHECK-NEXT:     v31.h = vsub(v7.h,v0.h)
+; CHECK-NEXT:     v5 = vmux(q0,v30,v5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1 = vmux(q1,v30,v1)
-; CHECK-NEXT:     v31 = vmux(q2,v5,v28)
+; CHECK-NEXT:     v0 = vmux(q1,v31,v0)
+; CHECK-NEXT:     v1 = vmux(q2,v5,v29)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1 = vmux(q3,v1,v2)
+; CHECK-NEXT:     v0 = vmux(q3,v0,v2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v0.b = vpack(v1.h,v31.h):sat
+; CHECK-NEXT:     v0.b = vpack(v0.h,v1.h):sat
 ; CHECK-NEXT:     jumpr r31
 ; CHECK-NEXT:     vmem(r1+#0) = v0.new
 ; CHECK-NEXT:    }
@@ -491,127 +491,127 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = ##-2147483648
 ; CHECK-NEXT:     r3:2 = combine(#1,#8)
-; CHECK-NEXT:     v5 = vmem(r0+#0)
+; CHECK-NEXT:     r4 = ##-2147483648
+; CHECK-NEXT:     v5 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1 = vsplat(r4)
+; CHECK-NEXT:     v0 = vsplat(r4)
 ; CHECK-NEXT:     r7 = #30
 ; CHECK-NEXT:     r6 = #24
-; CHECK-NEXT:     v2 = vmem(r0+#2)
+; CHECK-NEXT:     v4 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v10 = vsplat(r7)
 ; CHECK-NEXT:     r5 = #32
-; CHECK-NEXT:     v8.w = vasl(v4.w,r3)
-; CHECK-NEXT:     v4.cur = vmem(r0+#1)
+; CHECK-NEXT:     v9.w = vasl(v5.w,r3)
+; CHECK-NEXT:     v1 = vmem(r0+#3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v7.w = vasl(v5.w,r3)
-; CHECK-NEXT:     v12 = vxor(v12,v12)
-; CHECK-NEXT:     v8.w = vsub(v8.w,v1.w)
-; CHECK-NEXT:     v0 = vmem(r0+#3)
+; CHECK-NEXT:     v8.w = vasl(v4.w,r3)
+; CHECK-NEXT:     v14 = vxor(v14,v14)
+; CHECK-NEXT:     v9.w = vsub(v9.w,v0.w)
+; CHECK-NEXT:     v2 = vmem(r0+#2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v13 = vsplat(r5)
-; CHECK-NEXT:     v11.w = vasl(v0.w,r3)
-; CHECK-NEXT:     v7.w = vsub(v7.w,v1.w)
-; CHECK-NEXT:     q0 = vcmp.gt(v12.w,v5.w)
+; CHECK-NEXT:     v11.w = vasl(v2.w,r3)
+; CHECK-NEXT:     v8.w = vsub(v8.w,v0.w)
+; CHECK-NEXT:     q1 = vcmp.gt(v14.w,v5.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v9.w = vasl(v2.w,r3)
-; CHECK-NEXT:     q1 = vcmp.gt(v12.w,v4.w)
-; CHECK-NEXT:     v11.w = vsub(v11.w,v1.w)
+; CHECK-NEXT:     v12.w = vasl(v1.w,r3)
+; CHECK-NEXT:     q0 = vcmp.gt(v14.w,v4.w)
+; CHECK-NEXT:     v11.w = vsub(v11.w,v0.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3 = ##2147483647
 ; CHECK-NEXT:     r7 = #64
-; CHECK-NEXT:     v8.w = vasr(v8.w,r6)
+; CHECK-NEXT:     v9.w = vasr(v9.w,r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v22 = vsplat(r3)
-; CHECK-NEXT:     v7.w = vasr(v7.w,r6)
-; CHECK-NEXT:     v19.w = vsub(v9.w,v1.w)
-; CHECK-NEXT:     v8.w = vsub(v10.w,v8.w)
+; CHECK-NEXT:     v18 = vsplat(r3)
+; CHECK-NEXT:     v7.w = vasl(v5.w,r2)
+; CHECK-NEXT:     v19.w = vsub(v12.w,v0.w)
+; CHECK-NEXT:     v9.w = vsub(v10.w,v9.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v20.w = vasl(v4.w,r2)
-; CHECK-NEXT:     v27 = vmux(q1,v1,v22)
-; CHECK-NEXT:     v25 = vmux(q0,v1,v22)
-; CHECK-NEXT:     v7.w = vsub(v10.w,v7.w)
+; CHECK-NEXT:     v8.w = vasr(v8.w,r6)
+; CHECK-NEXT:     v25 = vmux(q1,v0,v18)
+; CHECK-NEXT:     v23 = vmux(q0,v0,v18)
+; CHECK-NEXT:     v9.w = vmin(v9.w,v13.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v6.w = vasl(v5.w,r2)
+; CHECK-NEXT:     v6.w = vasl(v4.w,r2)
+; CHECK-NEXT:     v7 = vor(v7,v0)
+; CHECK-NEXT:     v8.w = vsub(v10.w,v8.w)
+; CHECK-NEXT:     q3 = vcmp.gt(v9.w,v14.w)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v11.w = vasr(v11.w,r6)
 ; CHECK-NEXT:     v8.w = vmin(v8.w,v13.w)
-; CHECK-NEXT:     v9 = vor(v20,v1)
-; CHECK-NEXT:     v21.w = vmin(v7.w,v13.w)
+; CHECK-NEXT:     v6 = vor(v6,v0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v5.w = vasr(v19.w,r6)
-; CHECK-NEXT:     q3 = vcmp.gt(v8.w,v12.w)
-; CHECK-NEXT:     v6 = vor(v6,v1)
-; CHECK-NEXT:     q2 = vcmp.gt(v21.w,v12.w)
+; CHECK-NEXT:     v11.w = vsub(v10.w,v11.w)
+; CHECK-NEXT:     q2 = vcmp.gt(v8.w,v14.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v11.w = vasr(v11.w,r6)
+; CHECK-NEXT:     v3.w = vasl(v1.w,r2)
 ; CHECK-NEXT:     v5.w = vsub(v10.w,v5.w)
+; CHECK-NEXT:     v21.w = vmin(v11.w,v13.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v3.w = vasl(v2.w,r2)
-; CHECK-NEXT:     v10.w = vsub(v10.w,v11.w)
+; CHECK-NEXT:     v20.w = vasl(v2.w,r2)
+; CHECK-NEXT:     v3 = vor(v3,v0)
 ; CHECK-NEXT:     v5.w = vmin(v5.w,v13.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v23.w = vasl(v0.w,r2)
-; CHECK-NEXT:     v3 = vor(v3,v1)
-; CHECK-NEXT:     v10.w = vmin(v10.w,v13.w)
+; CHECK-NEXT:     v7.w = vlsr(v7.w,v9.w)
+; CHECK-NEXT:     v12 = vor(v20,v0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v8.w = vlsr(v9.w,v8.w)
-; CHECK-NEXT:     v4 = vor(v23,v1)
+; CHECK-NEXT:     v6.w = vlsr(v6.w,v8.w)
+; CHECK-NEXT:     v24.w = vsub(v14.w,v7.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v6.w = vlsr(v6.w,v21.w)
-; CHECK-NEXT:     v26.w = vsub(v12.w,v8.w)
+; CHECK-NEXT:     v26.w = vlsr(v12.w,v21.w)
+; CHECK-NEXT:     v22.w = vsub(v14.w,v6.w)
+; CHECK-NEXT:     v7 = vmux(q1,v24,v7)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v3.w = vlsr(v3.w,v5.w)
-; CHECK-NEXT:     v24.w = vsub(v12.w,v6.w)
-; CHECK-NEXT:     v8 = vmux(q1,v26,v8)
+; CHECK-NEXT:     v6 = vmux(q0,v22,v6)
+; CHECK-NEXT:     q0 = vcmp.gt(v14.w,v2.w)
+; CHECK-NEXT:     v27.w = vsub(v14.w,v26.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v4.w = vlsr(v4.w,v10.w)
-; CHECK-NEXT:     v6 = vmux(q0,v24,v6)
-; CHECK-NEXT:     q0 = vcmp.gt(v12.w,v2.w)
-; CHECK-NEXT:     v28.w = vsub(v12.w,v3.w)
+; CHECK-NEXT:     v2 = vmux(q3,v7,v25)
+; CHECK-NEXT:     v29.w = vsub(v14.w,v3.w)
+; CHECK-NEXT:     q3 = vcmp.gt(v14.w,v1.w)
+; CHECK-NEXT:     v6 = vmux(q2,v6,v23)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v2 = vmux(q3,v8,v27)
-; CHECK-NEXT:     v29.w = vsub(v12.w,v4.w)
-; CHECK-NEXT:     q3 = vcmp.gt(v12.w,v0.w)
-; CHECK-NEXT:     v6 = vmux(q2,v6,v25)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
-; CHECK-NEXT:     v30 = vmux(q0,v1,v22)
-; CHECK-NEXT:     v3 = vmux(q0,v28,v3)
-; CHECK-NEXT:     q2 = vcmp.gt(v5.w,v12.w)
-; CHECK-NEXT:     v4 = vmux(q3,v29,v4)
+; CHECK-NEXT:     v30 = vmux(q0,v0,v18)
+; CHECK-NEXT:     v28 = vmux(q0,v27,v26)
+; CHECK-NEXT:     q2 = vcmp.gt(v21.w,v14.w)
+; CHECK-NEXT:     v3 = vmux(q3,v29,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2.h = vpack(v2.w,v6.w):sat
-; CHECK-NEXT:     v1 = vmux(q3,v1,v22)
-; CHECK-NEXT:     q3 = vcmp.gt(v10.w,v12.w)
-; CHECK-NEXT:     v0 = vmux(q2,v3,v30)
+; CHECK-NEXT:     v0 = vmux(q3,v0,v18)
+; CHECK-NEXT:     q3 = vcmp.gt(v5.w,v14.w)
+; CHECK-NEXT:     v1 = vmux(q2,v28,v30)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1 = vmux(q3,v4,v1)
+; CHECK-NEXT:     v0 = vmux(q3,v3,v0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v3.h = vpack(v1.w,v0.w):sat
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v0.h = vpack(v1.w,v0.w):sat
+; CHECK-NEXT:     v0.h = vpack(v0.w,v1.w):sat
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v31.b = vpack(v3.h,v2.h):sat
@@ -638,13 +638,13 @@ define void @f32s8_1(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(##-2147483648,#8)
 ; CHECK-NEXT:     r4 = #1
-; CHECK-NEXT:     v1 = vmem(r0+#0)
+; CHECK-NEXT:     v1 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v3 = vsplat(r3)
 ; CHECK-NEXT:     r5 = #30
 ; CHECK-NEXT:     v4.w = vasl(v0.w,r4)
-; CHECK-NEXT:     v0.cur = vmem(r0+#1)
+; CHECK-NEXT:     v0.cur = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v5.w = vasl(v1.w,r4)
@@ -653,64 +653,64 @@ define void @f32s8_1(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:     r4 = #32
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v6 = vsplat(r5)
-; CHECK-NEXT:     v7 = vsplat(r4)
+; CHECK-NEXT:     v7 = vsplat(r5)
+; CHECK-NEXT:     v8 = vsplat(r4)
 ; CHECK-NEXT:     v2.w = vasl(v1.w,r2)
 ; CHECK-NEXT:     v5.w = vsub(v5.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v4.w = vasr(v4.w,r6)
-; CHECK-NEXT:     v26 = vxor(v26,v26)
+; CHECK-NEXT:     v27 = vxor(v27,v27)
 ; CHECK-NEXT:     v2 = vor(v2,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3 = ##2147483647
 ; CHECK-NEXT:     v5.w = vasr(v5.w,r6)
-; CHECK-NEXT:     q0 = vcmp.gt(v26.w,v1.w)
+; CHECK-NEXT:     q0 = vcmp.gt(v27.w,v0.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v27 = vsplat(r3)
-; CHECK-NEXT:     v4.w = vsub(v6.w,v4.w)
-; CHECK-NEXT:     q2 = vcmp.gt(v26.w,v0.w)
-; CHECK-NEXT:     v5.w = vsub(v6.w,v5.w)
+; CHECK-NEXT:     v28 = vsplat(r3)
+; CHECK-NEXT:     v6.w = vasl(v0.w,r2)
+; CHECK-NEXT:     v4.w = vsub(v7.w,v4.w)
+; CHECK-NEXT:     q2 = vcmp.gt(v27.w,v1.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v8.w = vasl(v0.w,r2)
-; CHECK-NEXT:     v4.w = vmin(v4.w,v7.w)
-; CHECK-NEXT:     v30 = vmux(q0,v3,v27)
-; CHECK-NEXT:     v5.w = vmin(v5.w,v7.w)
+; CHECK-NEXT:     v5.w = vsub(v7.w,v5.w)
+; CHECK-NEXT:     v4.w = vmin(v4.w,v8.w)
+; CHECK-NEXT:     v31 = vmux(q0,v3,v28)
+; CHECK-NEXT:     v6 = vor(v6,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v25 = vor(v8,v3)
-; CHECK-NEXT:     v1 = vmux(q2,v3,v27)
-; CHECK-NEXT:     q3 = vcmp.gt(v4.w,v26.w)
-; CHECK-NEXT:     q1 = vcmp.gt(v5.w,v26.w)
+; CHECK-NEXT:     v5.w = vmin(v5.w,v8.w)
+; CHECK-NEXT:     q1 = vcmp.gt(v4.w,v27.w)
+; CHECK-NEXT:     v0 = vmux(q2,v3,v28)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r2 = #64
-; CHECK-NEXT:     v2.w = vlsr(v2.w,v5.w)
+; CHECK-NEXT:     v6.w = vlsr(v6.w,v4.w)
+; CHECK-NEXT:     q3 = vcmp.gt(v5.w,v27.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v28.w = vlsr(v25.w,v4.w)
-; CHECK-NEXT:     v29.w = vsub(v26.w,v2.w)
+; CHECK-NEXT:     v2.w = vlsr(v2.w,v5.w)
+; CHECK-NEXT:     v29.w = vsub(v27.w,v6.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v6.w = vsub(v26.w,v28.w)
-; CHECK-NEXT:     v0 = vmux(q0,v29,v2)
+; CHECK-NEXT:     v30.w = vsub(v27.w,v2.w)
+; CHECK-NEXT:     v1 = vmux(q0,v29,v6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v31 = vmux(q2,v6,v28)
-; CHECK-NEXT:     v0 = vmux(q1,v0,v30)
+; CHECK-NEXT:     v2 = vmux(q2,v30,v2)
+; CHECK-NEXT:     v1 = vmux(q1,v1,v31)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     q3 = vsetq(r2)
-; CHECK-NEXT:     v1 = vmux(q3,v31,v1)
+; CHECK-NEXT:     v0 = vmux(q3,v2,v0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2.h = vpack(v1.w,v0.w):sat
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v0.h = vpack(v1.w,v0.w):sat
+; CHECK-NEXT:     v0.h = vpack(v0.w,v1.w):sat
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v0.b = vpack(v2.h,v0.h):sat
@@ -808,13 +808,13 @@ define void @f32s16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(##-2147483648,#1)
 ; CHECK-NEXT:     r4 = #30
-; CHECK-NEXT:     v1 = vmem(r0+#0)
+; CHECK-NEXT:     v0 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2 = vsplat(r3)
 ; CHECK-NEXT:     r6 = #8
 ; CHECK-NEXT:     v3.w = vasl(v0.w,r2)
-; CHECK-NEXT:     v0.cur = vmem(r0+#1)
+; CHECK-NEXT:     v1 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v4 = vsplat(r4)
@@ -828,55 +828,55 @@ define void @f32s16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3 = #32
-; CHECK-NEXT:     v5.w = vasl(v1.w,r6)
-; CHECK-NEXT:     q1 = vcmp.gt(v7.w,v0.w)
+; CHECK-NEXT:     v5.w = vasl(v0.w,r6)
+; CHECK-NEXT:     q1 = vcmp.gt(v7.w,v1.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6 = vsplat(r3)
-; CHECK-NEXT:     v27.w = vasr(v3.w,r5)
+; CHECK-NEXT:     v28.w = vasr(v3.w,r5)
 ; CHECK-NEXT:     v5 = vor(v5,v2)
-; CHECK-NEXT:     q0 = vcmp.gt(v7.w,v1.w)
+; CHECK-NEXT:     q0 = vcmp.gt(v7.w,v0.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v9 = vsplat(r4)
 ; CHECK-NEXT:     v8.w = vasr(v8.w,r5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v26.w = vasl(v0.w,r6)
-; CHECK-NEXT:     v0.w = vsub(v4.w,v27.w)
+; CHECK-NEXT:     v27.w = vasl(v1.w,r6)
+; CHECK-NEXT:     v1.w = vsub(v4.w,v28.w)
 ; CHECK-NEXT:     v4.w = vsub(v4.w,v8.w)
-; CHECK-NEXT:     v28 = vmux(q0,v2,v9)
+; CHECK-NEXT:     v29 = vmux(q0,v2,v9)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     v1.w = vmin(v1.w,v6.w)
+; CHECK-NEXT:     v0 = vor(v27,v2)
 ; CHECK-NEXT:     v4.w = vmin(v4.w,v6.w)
-; CHECK-NEXT:     v1 = vor(v26,v2)
-; CHECK-NEXT:     v0.w = vmin(v0.w,v6.w)
 ; CHECK-NEXT:     v2 = vmux(q1,v2,v9)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     q2 = vcmp.gt(v4.w,v7.w)
-; CHECK-NEXT:     q3 = vcmp.gt(v0.w,v7.w)
+; CHECK-NEXT:     q2 = vcmp.gt(v1.w,v7.w)
+; CHECK-NEXT:     q3 = vcmp.gt(v4.w,v7.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v5.w = vlsr(v5.w,v4.w)
+; CHECK-NEXT:     v5.w = vlsr(v5.w,v1.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1.w = vlsr(v1.w,v0.w)
-; CHECK-NEXT:     v29.w = vsub(v7.w,v5.w)
+; CHECK-NEXT:     v0.w = vlsr(v0.w,v4.w)
+; CHECK-NEXT:     v30.w = vsub(v7.w,v5.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v30.w = vsub(v7.w,v1.w)
-; CHECK-NEXT:     v5 = vmux(q0,v29,v5)
+; CHECK-NEXT:     v31.w = vsub(v7.w,v0.w)
+; CHECK-NEXT:     v5 = vmux(q0,v30,v5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1 = vmux(q1,v30,v1)
-; CHECK-NEXT:     v31 = vmux(q2,v5,v28)
+; CHECK-NEXT:     v0 = vmux(q1,v31,v0)
+; CHECK-NEXT:     v1 = vmux(q2,v5,v29)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v1 = vmux(q3,v1,v2)
+; CHECK-NEXT:     v0 = vmux(q3,v0,v2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v0.h = vpack(v1.w,v31.w):sat
+; CHECK-NEXT:     v0.h = vpack(v0.w,v1.w):sat
 ; CHECK-NEXT:     jumpr r31
 ; CHECK-NEXT:     vmem(r1+#0) = v0.new
 ; CHECK-NEXT:    }
@@ -1097,13 +1097,13 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(##32768,#1)
 ; CHECK-NEXT:     r4 = #14
-; CHECK-NEXT:     v0 = vmem(r0+#1)
+; CHECK-NEXT:     v0 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2.h = vsplat(r3)
 ; CHECK-NEXT:     r7:6 = combine(#11,#16)
 ; CHECK-NEXT:     v3.h = vasl(v0.h,r2)
-; CHECK-NEXT:     v1 = vmem(r0+#0)
+; CHECK-NEXT:     v1 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6.h = vsplat(r4)
@@ -1113,7 +1113,7 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v7.h = vsplat(r6)
-; CHECK-NEXT:     v5.h = vasl(v1.h,r5)
+; CHECK-NEXT:     v5.h = vasl(v0.h,r5)
 ; CHECK-NEXT:     v4.h = vsub(v4.h,v2.h)
 ; CHECK-NEXT:     v28 = vxor(v28,v28)
 ; CHECK-NEXT:    }
@@ -1125,28 +1125,26 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v29.h = vsplat(r2)
 ; CHECK-NEXT:     v4.h = vasr(v4.h,r7)
-; CHECK-NEXT:     q2 = vcmp.gt(v28.h,v1.h)
+; CHECK-NEXT:     q2 = vcmp.gt(v28.h,v0.h)
 ; CHECK-NEXT:     v3.h = vsub(v6.h,v3.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v8.h = vasl(v0.h,r5)
-; CHECK-NEXT:     q3 = vcmp.gt(v28.h,v0.h)
+; CHECK-NEXT:     v8.h = vasl(v1.h,r5)
+; CHECK-NEXT:     q3 = vcmp.gt(v28.h,v1.h)
 ; CHECK-NEXT:     v4.h = vsub(v6.h,v4.h)
 ; CHECK-NEXT:     v3.h = vmin(v3.h,v7.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v4.h = vmin(v4.h,v7.h)
 ; CHECK-NEXT:     v2 = vor(v8,v2)
-; CHECK-NEXT:     q1 = vcmp.gt(v28.h,v3.h)
+; CHECK-NEXT:     q0 = vcmp.gt(v28.h,v3.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     q0 = vcmp.gt(v28.h,v4.h)
+; CHECK-NEXT:     v5.h = vlsr(v5.h,v3.h)
+; CHECK-NEXT:     q1 = vcmp.gt(v28.h,v4.h)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v5.h = vlsr(v5.h,v4.h)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
-; CHECK-NEXT:     v2.h = vlsr(v2.h,v3.h)
+; CHECK-NEXT:     v2.h = vlsr(v2.h,v4.h)
 ; CHECK-NEXT:     v30 = vmux(q0,v29,v5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -1552,7 +1550,7 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:     v5 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v3 = vsplat(r4)
+; CHECK-NEXT:     v4 = vsplat(r4)
 ; CHECK-NEXT:     r5 = #30
 ; CHECK-NEXT:     r6 = #24
 ; CHECK-NEXT:     v2 = vmem(r0+#1)
@@ -1561,32 +1559,32 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:     v14 = vsplat(r5)
 ; CHECK-NEXT:     r4 = #32
 ; CHECK-NEXT:     v8.w = vasl(v5.w,r2)
-; CHECK-NEXT:     v0 = vmem(r0+#3)
+; CHECK-NEXT:     v0 = vmem(r0+#2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v9.w = vasl(v2.w,r2)
 ; CHECK-NEXT:     v13 = vxor(v13,v13)
-; CHECK-NEXT:     v8.w = vsub(v8.w,v3.w)
-; CHECK-NEXT:     v1 = vmem(r0+#2)
+; CHECK-NEXT:     v8.w = vsub(v8.w,v4.w)
+; CHECK-NEXT:     v1 = vmem(r0+#3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v20 = vsplat(r4)
-; CHECK-NEXT:     v12.w = vasl(v0.w,r2)
-; CHECK-NEXT:     v9.w = vsub(v9.w,v3.w)
+; CHECK-NEXT:     v21 = vsplat(r4)
+; CHECK-NEXT:     v11.w = vasl(v0.w,r2)
+; CHECK-NEXT:     v9.w = vsub(v9.w,v4.w)
 ; CHECK-NEXT:     q0 = vcmp.gt(v13.w,v5.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v11.w = vasl(v1.w,r2)
+; CHECK-NEXT:     v12.w = vasl(v1.w,r2)
 ; CHECK-NEXT:     q3 = vcmp.gt(v13.w,v2.w)
-; CHECK-NEXT:     v12.w = vsub(v12.w,v3.w)
+; CHECK-NEXT:     v11.w = vsub(v11.w,v4.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r2 = ##2147483647
 ; CHECK-NEXT:     r7 = #64
-; CHECK-NEXT:     v11.w = vsub(v11.w,v3.w)
+; CHECK-NEXT:     v12.w = vsub(v12.w,v4.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v22 = vsplat(r2)
+; CHECK-NEXT:     v23 = vsplat(r2)
 ; CHECK-NEXT:     v8.w = vasr(v8.w,r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -1596,68 +1594,68 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6.w = vasl(v5.w,r3)
 ; CHECK-NEXT:     v9.w = vsub(v14.w,v9.w)
-; CHECK-NEXT:     v8.w = vmin(v8.w,v20.w)
+; CHECK-NEXT:     v8.w = vmin(v8.w,v21.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v7.w = vasl(v2.w,r3)
-; CHECK-NEXT:     v6 = vor(v6,v3)
-; CHECK-NEXT:     v9.w = vmin(v9.w,v20.w)
+; CHECK-NEXT:     v6 = vor(v6,v4)
+; CHECK-NEXT:     v9.w = vmin(v9.w,v21.w)
 ; CHECK-NEXT:     q1 = vcmp.gt(v13.w,v8.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v19.w = vasr(v11.w,r6)
-; CHECK-NEXT:     v7 = vor(v7,v3)
+; CHECK-NEXT:     v20.w = vasr(v11.w,r6)
+; CHECK-NEXT:     v7 = vor(v7,v4)
 ; CHECK-NEXT:     q2 = vcmp.gt(v13.w,v9.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v12.w = vasr(v12.w,r6)
-; CHECK-NEXT:     v5.w = vsub(v14.w,v19.w)
+; CHECK-NEXT:     v5.w = vsub(v14.w,v20.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v4.w = vasl(v1.w,r3)
-; CHECK-NEXT:     v21.w = vsub(v14.w,v12.w)
-; CHECK-NEXT:     v5.w = vmin(v5.w,v20.w)
+; CHECK-NEXT:     v3.w = vasl(v1.w,r3)
+; CHECK-NEXT:     v22.w = vsub(v14.w,v12.w)
+; CHECK-NEXT:     v5.w = vmin(v5.w,v21.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v10.w = vasl(v0.w,r3)
-; CHECK-NEXT:     v4 = vor(v4,v3)
+; CHECK-NEXT:     v3 = vor(v3,v4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6.w = vlsr(v6.w,v8.w)
-; CHECK-NEXT:     v3 = vor(v10,v3)
-; CHECK-NEXT:     v10.w = vmin(v21.w,v20.w)
+; CHECK-NEXT:     v10 = vor(v10,v4)
+; CHECK-NEXT:     v4.w = vmin(v22.w,v21.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v7.w = vlsr(v7.w,v9.w)
-; CHECK-NEXT:     v24 = vmux(q1,v22,v6)
+; CHECK-NEXT:     v6 = vmux(q1,v23,v6)
 ; CHECK-NEXT:     q1 = vcmp.gt(v13.w,v5.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v23.w = vlsr(v4.w,v5.w)
-; CHECK-NEXT:     v25 = vmux(q2,v22,v7)
-; CHECK-NEXT:     q2 = vcmp.gt(v13.w,v10.w)
-; CHECK-NEXT:     v4 = vmux(q0,v13,v24)
+; CHECK-NEXT:     v24.w = vlsr(v10.w,v5.w)
+; CHECK-NEXT:     v7 = vmux(q2,v23,v7)
+; CHECK-NEXT:     q2 = vcmp.gt(v13.w,v4.w)
+; CHECK-NEXT:     v25 = vmux(q0,v13,v6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v3.w = vlsr(v3.w,v10.w)
-; CHECK-NEXT:     v26 = vmux(q3,v13,v25)
-; CHECK-NEXT:     v2 = vmux(q1,v22,v23)
-; CHECK-NEXT:     q1 = vcmp.gt(v13.w,v1.w)
+; CHECK-NEXT:     v3.w = vlsr(v3.w,v4.w)
+; CHECK-NEXT:     v26 = vmux(q3,v13,v7)
+; CHECK-NEXT:     v2 = vmux(q1,v23,v24)
+; CHECK-NEXT:     q1 = vcmp.gt(v13.w,v0.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v27 = vmux(q2,v22,v3)
-; CHECK-NEXT:     q3 = vcmp.gt(v13.w,v0.w)
+; CHECK-NEXT:     v27 = vmux(q2,v23,v3)
+; CHECK-NEXT:     q3 = vcmp.gt(v13.w,v1.w)
 ; CHECK-NEXT:     v28 = vmux(q1,v13,v2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v29.uh = vpack(v26.w,v4.w):sat
-; CHECK-NEXT:     v1 = vmux(q3,v13,v27)
+; CHECK-NEXT:     v29.uh = vpack(v26.w,v25.w):sat
+; CHECK-NEXT:     v0 = vmux(q3,v13,v27)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v30.uh = vpack(v1.w,v28.w):sat
+; CHECK-NEXT:     v30.uh = vpack(v28.w,v0.w):sat
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v0.uh = vpack(v1.w,v28.w):sat
+; CHECK-NEXT:     v0.uh = vpack(v0.w,v28.w):sat
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v31.ub = vpack(v30.h,v29.h):sat
@@ -1684,13 +1682,13 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(##-2147483648,#1)
 ; CHECK-NEXT:     r4 = #30
-; CHECK-NEXT:     v0 = vmem(r0+#1)
+; CHECK-NEXT:     v0 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2 = vsplat(r3)
 ; CHECK-NEXT:     r7:6 = combine(#24,#32)
 ; CHECK-NEXT:     v3.w = vasl(v0.w,r2)
-; CHECK-NEXT:     v1 = vmem(r0+#0)
+; CHECK-NEXT:     v1 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6 = vsplat(r4)
@@ -1700,7 +1698,7 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v7 = vsplat(r6)
-; CHECK-NEXT:     v5.w = vasl(v1.w,r5)
+; CHECK-NEXT:     v5.w = vasl(v0.w,r5)
 ; CHECK-NEXT:     v4.w = vsub(v4.w,v2.w)
 ; CHECK-NEXT:     v27 = vxor(v27,v27)
 ; CHECK-NEXT:    }
@@ -1712,13 +1710,13 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v28 = vsplat(r3)
 ; CHECK-NEXT:     v4.w = vasr(v4.w,r7)
-; CHECK-NEXT:     q2 = vcmp.gt(v27.w,v1.w)
+; CHECK-NEXT:     q2 = vcmp.gt(v27.w,v0.w)
 ; CHECK-NEXT:     v3.w = vsub(v6.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r2 = #64
-; CHECK-NEXT:     v8.w = vasl(v0.w,r5)
-; CHECK-NEXT:     q3 = vcmp.gt(v27.w,v0.w)
+; CHECK-NEXT:     v8.w = vasl(v1.w,r5)
+; CHECK-NEXT:     q3 = vcmp.gt(v27.w,v1.w)
 ; CHECK-NEXT:     v4.w = vsub(v6.w,v4.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -1727,14 +1725,14 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:     v2 = vor(v8,v2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     q1 = vcmp.gt(v27.w,v3.w)
-; CHECK-NEXT:     q0 = vcmp.gt(v27.w,v4.w)
+; CHECK-NEXT:     q0 = vcmp.gt(v27.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v5.w = vlsr(v5.w,v4.w)
+; CHECK-NEXT:     v5.w = vlsr(v5.w,v3.w)
+; CHECK-NEXT:     q1 = vcmp.gt(v27.w,v4.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v2.w = vlsr(v2.w,v3.w)
+; CHECK-NEXT:     v2.w = vlsr(v2.w,v4.w)
 ; CHECK-NEXT:     v29 = vmux(q0,v28,v5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -1843,13 +1841,13 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(##-2147483648,#1)
 ; CHECK-NEXT:     r4 = #30
-; CHECK-NEXT:     v0 = vmem(r0+#1)
+; CHECK-NEXT:     v0 = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2 = vsplat(r3)
 ; CHECK-NEXT:     r7:6 = combine(#24,#32)
 ; CHECK-NEXT:     v3.w = vasl(v0.w,r2)
-; CHECK-NEXT:     v1 = vmem(r0+#0)
+; CHECK-NEXT:     v1 = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v6 = vsplat(r4)
@@ -1859,7 +1857,7 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v7 = vsplat(r6)
-; CHECK-NEXT:     v5.w = vasl(v1.w,r5)
+; CHECK-NEXT:     v5.w = vasl(v0.w,r5)
 ; CHECK-NEXT:     v4.w = vsub(v4.w,v2.w)
 ; CHECK-NEXT:     v28 = vxor(v28,v28)
 ; CHECK-NEXT:    }
@@ -1871,28 +1869,26 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v29 = vsplat(r2)
 ; CHECK-NEXT:     v4.w = vasr(v4.w,r7)
-; CHECK-NEXT:     q2 = vcmp.gt(v28.w,v1.w)
+; CHECK-NEXT:     q2 = vcmp.gt(v28.w,v0.w)
 ; CHECK-NEXT:     v3.w = vsub(v6.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v8.w = vasl(v0.w,r5)
-; CHECK-NEXT:     q3 = vcmp.gt(v28.w,v0.w)
+; CHECK-NEXT:     v8.w = vasl(v1.w,r5)
+; CHECK-NEXT:     q3 = vcmp.gt(v28.w,v1.w)
 ; CHECK-NEXT:     v4.w = vsub(v6.w,v4.w)
 ; CHECK-NEXT:     v3.w = vmin(v3.w,v7.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v4.w = vmin(v4.w,v7.w)
 ; CHECK-NEXT:     v2 = vor(v8,v2)
-; CHECK-NEXT:     q1 = vcmp.gt(v28.w,v3.w)
+; CHECK-NEXT:     q0 = vcmp.gt(v28.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     q0 = vcmp.gt(v28.w,v4.w)
+; CHECK-NEXT:     v5.w = vlsr(v5.w,v3.w)
+; CHECK-NEXT:     q1 = vcmp.gt(v28.w,v4.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v5.w = vlsr(v5.w,v4.w)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
-; CHECK-NEXT:     v2.w = vlsr(v2.w,v3.w)
+; CHECK-NEXT:     v2.w = vlsr(v2.w,v4.w)
 ; CHECK-NEXT:     v30 = vmux(q0,v29,v5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
index c0e38b9243033..c3308ec193995 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
@@ -1042,7 +1042,7 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(#8,#1)
 ; CHECK-NEXT:     r6 = #255
-; CHECK-NEXT:     v6.w = vabs(v1.w)
+; CHECK-NEXT:     v4.w = vabs(v1.w)
 ; CHECK-NEXT:     v1.cur = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -1054,102 +1054,102 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v9 = vsplat(r4)
 ; CHECK-NEXT:     v8 = vsplat(r6)
-; CHECK-NEXT:     v3.uw = vcl0(v6.uw)
-; CHECK-NEXT:     v20 = vxor(v20,v20)
+; CHECK-NEXT:     r4 = #159
+; CHECK-NEXT:     v3.uw = vcl0(v4.uw)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = #159
-; CHECK-NEXT:     v4.uw = vcl0(v5.uw)
-; CHECK-NEXT:     v3.w = vadd(v3.w,v2.w)
+; CHECK-NEXT:     v6.uw = vcl0(v5.uw)
+; CHECK-NEXT:     v7.w = vadd(v3.w,v2.w)
+; CHECK-NEXT:     v3 = vxor(v3,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v27 = vsplat(r4)
+; CHECK-NEXT:     v26 = vsplat(r4)
 ; CHECK-NEXT:     r5 = ##-2147483648
-; CHECK-NEXT:     v7.w = vadd(v4.w,v2.w)
+; CHECK-NEXT:     v6.w = vadd(v6.w,v2.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v13 = vsplat(r5)
-; CHECK-NEXT:     v6.w = vasl(v6.w,v3.w)
-; CHECK-NEXT:     q0 = vcmp.gt(v20.w,v1.w)
+; CHECK-NEXT:     v4.w = vasl(v4.w,v7.w)
+; CHECK-NEXT:     q0 = vcmp.gt(v3.w,v1.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v5.w = vasl(v5.w,v7.w)
-; CHECK-NEXT:     v26 = vmux(q0,v13,v20)
-; CHECK-NEXT:     v10.w = vadd(v6.w,v8.w)
-; CHECK-NEXT:     v11 = vand(v6,v9)
+; CHECK-NEXT:     v5.w = vasl(v5.w,v6.w)
+; CHECK-NEXT:     v25 = vmux(q0,v13,v3)
+; CHECK-NEXT:     v10.w = vadd(v4.w,v8.w)
+; CHECK-NEXT:     v11 = vand(v4,v9)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v9 = vand(v5,v9)
-; CHECK-NEXT:     q3 = vcmp.eq(v11.w,v20.w)
+; CHECK-NEXT:     q3 = vcmp.eq(v11.w,v3.w)
 ; CHECK-NEXT:     v8.w = vadd(v5.w,v8.w)
-; CHECK-NEXT:     q1 = vcmp.gt(v6.uw,v10.uw)
+; CHECK-NEXT:     q1 = vcmp.gt(v4.uw,v10.uw)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v21.uw = vlsr(v10.uw,r3)
-; CHECK-NEXT:     q2 = vcmp.eq(v9.w,v20.w)
-; CHECK-NEXT:     v22 = vmux(q3,v20,v2)
+; CHECK-NEXT:     v20.uw = vlsr(v10.uw,r3)
+; CHECK-NEXT:     q2 = vcmp.eq(v9.w,v3.w)
+; CHECK-NEXT:     v21 = vmux(q3,v3,v2)
 ; CHECK-NEXT:     q3 = vcmp.gt(v5.uw,v8.uw)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v8.uw = vlsr(v8.uw,r3)
-; CHECK-NEXT:     v9.w = vadd(v21.w,v22.w)
-; CHECK-NEXT:     v24 = vmux(q2,v20,v2)
-; CHECK-NEXT:     v23 = vmux(q1,v2,v20)
+; CHECK-NEXT:     v9.w = vadd(v20.w,v21.w)
+; CHECK-NEXT:     v23 = vmux(q2,v3,v2)
+; CHECK-NEXT:     v22 = vmux(q1,v2,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v12.uw = vlsr(v6.uw,r3)
-; CHECK-NEXT:     v2 = vmux(q3,v2,v20)
-; CHECK-NEXT:     v25.w = vadd(v8.w,v24.w)
-; CHECK-NEXT:     v3.w = vsub(v23.w,v3.w)
+; CHECK-NEXT:     v12.uw = vlsr(v4.uw,r3)
+; CHECK-NEXT:     v2 = vmux(q3,v2,v3)
+; CHECK-NEXT:     v24.w = vadd(v8.w,v23.w)
+; CHECK-NEXT:     v7.w = vsub(v22.w,v7.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r3)
-; CHECK-NEXT:     v2.w = vsub(v2.w,v7.w)
-; CHECK-NEXT:     q3 = vcmp.eq(v12.w,v21.w)
-; CHECK-NEXT:     v3.w = vadd(v3.w,v27.w)
+; CHECK-NEXT:     v2.w = vsub(v2.w,v6.w)
+; CHECK-NEXT:     q3 = vcmp.eq(v12.w,v20.w)
+; CHECK-NEXT:     v7.w = vadd(v7.w,v26.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3 = #23
-; CHECK-NEXT:     v6.uw = vlsr(v21.uw,r2)
+; CHECK-NEXT:     v4.uw = vlsr(v20.uw,r2)
 ; CHECK-NEXT:     q2 = vcmp.eq(v5.w,v8.w)
-; CHECK-NEXT:     v2.w = vadd(v2.w,v27.w)
+; CHECK-NEXT:     v2.w = vadd(v2.w,v26.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v9.uw = vlsr(v9.uw,r2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v28.uw = vlsr(v25.uw,r2)
-; CHECK-NEXT:     v6 = vmux(q3,v9,v6)
-; CHECK-NEXT:     q3 = vcmp.gt(v20.w,v0.w)
+; CHECK-NEXT:     v27.uw = vlsr(v24.uw,r2)
+; CHECK-NEXT:     v4 = vmux(q3,v9,v4)
+; CHECK-NEXT:     q3 = vcmp.gt(v3.w,v0.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v29.uw = vlsr(v8.uw,r2)
-; CHECK-NEXT:     v30 = vmux(q3,v13,v20)
-; CHECK-NEXT:     v6 = vor(v26,v6)
-; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v20.w)
+; CHECK-NEXT:     v28.uw = vlsr(v8.uw,r2)
+; CHECK-NEXT:     v30 = vmux(q3,v13,v3)
+; CHECK-NEXT:     v4 = vor(v25,v4)
+; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v3.w = vasl(v3.w,r3)
-; CHECK-NEXT:     v5 = vmux(q2,v28,v29)
-; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v20.w)
+; CHECK-NEXT:     v29.w = vasl(v7.w,r3)
+; CHECK-NEXT:     v5 = vmux(q2,v27,v28)
+; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2.w = vasl(v2.w,r3)
 ; CHECK-NEXT:     v31 = vor(v30,v5)
-; CHECK-NEXT:     v3 = vor(v6,v3)
+; CHECK-NEXT:     v4 = vor(v4,v29)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v1 = vor(v31,v2)
-; CHECK-NEXT:     v3 = vmux(q2,v20,v3)
+; CHECK-NEXT:     v4 = vmux(q2,v3,v4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v0 = vmux(q3,v20,v1)
+; CHECK-NEXT:     v0 = vmux(q3,v3,v1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v2.qf32 = vadd(v3.sf,v20.sf)
+; CHECK-NEXT:     v2.qf32 = vadd(v4.sf,v3.sf)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v3.qf32 = vadd(v0.sf,v20.sf)
+; CHECK-NEXT:     v3.qf32 = vadd(v0.sf,v3.sf)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v0.hf = v3:2.qf32
@@ -2369,20 +2369,20 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r3:2 = combine(#8,#1)
 ; CHECK-NEXT:     r6 = #255
-; CHECK-NEXT:     v3.uw = vcl0(v0.uw)
-; CHECK-NEXT:     v0.cur = vmem(r0+#1)
+; CHECK-NEXT:     v3.uw = vcl0(v1.uw)
+; CHECK-NEXT:     v1.cur = vmem(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v2 = vsplat(r2)
 ; CHECK-NEXT:     r4 = #512
-; CHECK-NEXT:     v4.uw = vcl0(v1.uw)
-; CHECK-NEXT:     v1.cur = vmem(r0+#0)
+; CHECK-NEXT:     v4.uw = vcl0(v0.uw)
+; CHECK-NEXT:     v0.cur = vmem(r0+#1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v7 = vsplat(r4)
 ; CHECK-NEXT:     v6 = vsplat(r6)
-; CHECK-NEXT:     v4.w = vadd(v4.w,v2.w)
 ; CHECK-NEXT:     v3.w = vadd(v3.w,v2.w)
+; CHECK-NEXT:     v4.w = vadd(v4.w,v2.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r4 = #159
@@ -2390,10 +2390,10 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v10 = vsplat(r4)
-; CHECK-NEXT:     v5.w = vasl(v1.w,v4.w)
+; CHECK-NEXT:     v5.w = vasl(v1.w,v3.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v8.w = vasl(v0.w,v3.w)
+; CHECK-NEXT:     v8.w = vasl(v0.w,v4.w)
 ; CHECK-NEXT:     v11.w = vadd(v5.w,v6.w)
 ; CHECK-NEXT:     v13 = vand(v5,v7)
 ; CHECK-NEXT:    }
@@ -2416,15 +2416,15 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:     v2 = vmux(q0,v9,v2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v4.w = vsub(v29.w,v4.w)
+; CHECK-NEXT:     v3.w = vsub(v29.w,v3.w)
 ; CHECK-NEXT:     v7.w = vadd(v27.w,v28.w)
-; CHECK-NEXT:     v3.w = vsub(v30.w,v3.w)
+; CHECK-NEXT:     v4.w = vsub(v30.w,v4.w)
 ; CHECK-NEXT:     v2.w = vadd(v6.w,v2.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v12.uw = vlsr(v5.uw,r3)
-; CHECK-NEXT:     v4.w = vadd(v4.w,v10.w)
 ; CHECK-NEXT:     v3.w = vadd(v3.w,v10.w)
+; CHECK-NEXT:     v4.w = vadd(v4.w,v10.w)
 ; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v9.w)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -2448,16 +2448,16 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
 ; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v4.w = vasl(v4.w,r3)
+; CHECK-NEXT:     v3.w = vasl(v3.w,r3)
 ; CHECK-NEXT:     v31 = vmux(q1,v2,v6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     v2.w = vasl(v3.w,r3)
-; CHECK-NEXT:     v4 = vor(v5,v4)
+; CHECK-NEXT:     v2.w = vasl(v4.w,r3)
+; CHECK-NEXT:     v3 = vor(v5,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v1 = vor(v31,v2)
-; CHECK-NEXT:     v3 = vmux(q2,v9,v4)
+; CHECK-NEXT:     v3 = vmux(q2,v9,v3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     v0 = vmux(q3,v9,v1)
diff --git a/llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll b/llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll
new file mode 100644
index 0000000000000..f7262eacabe8e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll
@@ -0,0 +1,34 @@
+; Check if extract_subvectors is handled properly in Hexagon backend when the
+; the source vector is a vector-pair and result vector is not hvx vector size.
+; https://github.com/llvm/llvm-project/issues/128775
+;
+; Example of such a case:
+;    ...
+;    t2: v64i32,ch = CopyFromReg t0, Register:v64i32 %0
+;    t17: v2i32 = extract_subvector t2, Constant:i32<4>
+;    ...
+
+; RUN: llc -mtriple=hexagon -mattr="hvx-length128b" < %s | FileCheck %s
+
+; CHECK-LABEL: extract_subvec:
+; CHECK: r29 = and(r29,#-128)
+; CHECK: [[R1:r([0-9]+)]] = add(r29,#0)
+; CHECK: vmem([[R1]]+#0) = v0
+; CHECK-DAG: r[[R4:[0-9]+]] = memw([[R1]]+#0)
+; CHECK-DAG: r[[R5:[0-9]+]] = memw([[R1]]+#4)
+; CHECK-DAG: r[[R6:[0-9]+]] = memw([[R1]]+#8)
+; CHECK-DAG: r[[R7:[0-9]+]] = memw([[R1]]+#12)
+; CHECK-DAG: r[[R8:[0-9]+]] = memw([[R1]]+#16)
+; CHECK-DAG: r[[R9:[0-9]+]] = memw([[R1]]+#20)
+; CHECK-DAG: r[[R2:[0-9]+]] = memw([[R1]]+#24)
+; CHECK-DAG: r[[R3:[0-9]+]] = memw([[R1]]+#28)
+; CHECK-DAG: memd(r0+#0) = r[[R5]]:[[R4]]
+; CHECK-DAG: memd(r0+#8) = r[[R7]]:[[R6]]
+; CHECK-DAG: memd(r0+#16) = r[[R9]]:[[R8]]
+; CHECK-DAG: memw(r0+#24) = r[[R2]]
+define void @extract_subvec(<56 x i32> %val, ptr %buf) {
+entry:
+  %split = shufflevector <56 x i32> %val, <56 x i32> zeroinitializer, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+  store <7 x i32> %split, ptr %buf, align 32
+  ret void
+}

From f62b50e0e8f170412c95d7c1263c46a400bbeba8 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk@dianqk.net>
Date: Fri, 7 Mar 2025 05:46:32 +0800
Subject: [PATCH 235/282] [ValueTracking] Skip incoming values that are the
 same as the phi in `isGuaranteedNotToBeUndefOrPoison` (#130111)

Fixes (keep it open) #130110.

If the incoming value is PHI itself, we can skip this. If we can
guarantee that the other incoming values are neither undef nor poison,
then we can also guarantee that the value isn't either. If we cannot
guarantee that, it makes no sense in calculating it.

(cherry picked from commit 462eb7e28ef4507b16a4b45efb356bc6a3523615)
---
 llvm/lib/Analysis/ValueTracking.cpp          |  2 +
 llvm/test/Analysis/ValueTracking/phi-self.ll | 89 ++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 llvm/test/Analysis/ValueTracking/phi-self.ll

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 4e79f41df2eb9..02a48d35a74aa 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7777,6 +7777,8 @@ static bool isGuaranteedNotToBeUndefOrPoison(
       unsigned Num = PN->getNumIncomingValues();
       bool IsWellDefined = true;
       for (unsigned i = 0; i < Num; ++i) {
+        if (PN == PN->getIncomingValue(i))
+          continue;
         auto *TI = PN->getIncomingBlock(i)->getTerminator();
         if (!isGuaranteedNotToBeUndefOrPoison(PN->getIncomingValue(i), AC, TI,
                                               DT, Depth + 1, Kind)) {
diff --git a/llvm/test/Analysis/ValueTracking/phi-self.ll b/llvm/test/Analysis/ValueTracking/phi-self.ll
new file mode 100644
index 0000000000000..17afd872cab03
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/phi-self.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
+
+; Test `%r` can be replaced by `%nonpoison`.
+
+define i32 @other_noundef(i32 noundef %arg) {
+; CHECK-LABEL: define i32 @other_noundef(
+; CHECK-SAME: i32 noundef [[ARG:%.*]]) {
+; CHECK-NEXT:  [[START:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[NONPOISON:%.*]] = phi i32 [ 0, %[[START]] ], [ [[NONPOISON]], %[[BB0:.*]] ], [ [[ARG]], %[[BB1:.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = call i32 @opaque()
+; CHECK-NEXT:    switch i32 [[I]], label %[[EXIT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[BB0]]
+; CHECK-NEXT:      i32 1, label %[[BB1]]
+; CHECK-NEXT:    ]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[NONPOISON]]
+; CHECK:       [[BB0]]:
+; CHECK-NEXT:    br label %[[LOOP]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    br label %[[LOOP]]
+;
+start:
+  br label %loop
+
+loop:
+  %nonpoison = phi i32 [ 0, %start ], [ %nonpoison, %bb0 ], [ %arg, %bb1 ]
+  %i = call i32 @opaque()
+  switch i32 %i, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+exit:
+  %r = freeze i32 %nonpoison
+  ret i32 %r
+
+bb0:
+  br label %loop
+
+bb1:
+  br label %loop
+}
+
+define i32 @other_poison(i32 %arg) {
+; CHECK-LABEL: define i32 @other_poison(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:  [[START:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[MAYPOISON:%.*]] = phi i32 [ 0, %[[START]] ], [ [[MAYPOISON]], %[[BB0:.*]] ], [ [[ARG]], %[[BB1:.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = call i32 @opaque()
+; CHECK-NEXT:    switch i32 [[I]], label %[[EXIT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[BB0]]
+; CHECK-NEXT:      i32 1, label %[[BB1]]
+; CHECK-NEXT:    ]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[R:%.*]] = freeze i32 [[MAYPOISON]]
+; CHECK-NEXT:    ret i32 [[R]]
+; CHECK:       [[BB0]]:
+; CHECK-NEXT:    br label %[[LOOP]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    br label %[[LOOP]]
+;
+start:
+  br label %loop
+
+loop:
+  %maypoison = phi i32 [ 0, %start ], [ %maypoison, %bb0 ], [ %arg, %bb1 ]
+  %i = call i32 @opaque()
+  switch i32 %i, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+exit:
+  %r = freeze i32 %maypoison
+  ret i32 %r
+
+bb0:
+  br label %loop
+
+bb1:
+  br label %loop
+}
+
+declare i32 @opaque()

From f09bcfbdc90b300b77cb1ddb6f0cffe386388897 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Thu, 6 Mar 2025 16:17:12 +0800
Subject: [PATCH 236/282] [LoongArch] Relax the restrictions of inlineasm
 operand modifier 'u' and 'w' (#129864)

- Allow 'u' and 'w' on LASX, LSX or floating point register operands.
- Also add missing description in LangRef.

Fixes #129863.

(cherry picked from commit bae6644e1227b2555f92b1962dac6c2444eaaaf2)
---
 llvm/docs/LangRef.rst                         |  2 +
 .../Target/LoongArch/LoongArchAsmPrinter.cpp  | 35 ++++++++++------
 .../lasx/inline-asm-operand-modifier.ll       | 40 +++++++++++++++++++
 3 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index e002195cb7ed5..1c8eaa60e1c8a 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5826,6 +5826,8 @@ Hexagon:
 
 LoongArch:
 
+- ``u``: Print an LASX register.
+- ``w``: Print an LSX register.
 - ``z``: Print $zero register if operand is zero, otherwise print it normally.
 
 MSP430:
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 169f9568e5362..895a8e2646692 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -90,20 +90,29 @@ bool LoongArchAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
         return false;
       }
       break;
-    case 'w': // Print LSX registers.
-      if (MO.getReg().id() >= LoongArch::VR0 &&
-          MO.getReg().id() <= LoongArch::VR31)
-        break;
-      // The modifier is 'w' but the operand is not an LSX register; Report an
-      // unknown operand error.
-      return true;
     case 'u': // Print LASX registers.
-      if (MO.getReg().id() >= LoongArch::XR0 &&
-          MO.getReg().id() <= LoongArch::XR31)
-        break;
-      // The modifier is 'u' but the operand is not an LASX register; Report an
-      // unknown operand error.
-      return true;
+    case 'w': // Print LSX registers.
+    {
+      // If the operand is an LASX, LSX or floating point register, print the
+      // name of LASX or LSX register with the same index in that register
+      // class.
+      unsigned RegID = MO.getReg().id(), FirstReg;
+      if (RegID >= LoongArch::XR0 && RegID <= LoongArch::XR31)
+        FirstReg = LoongArch::XR0;
+      else if (RegID >= LoongArch::VR0 && RegID <= LoongArch::VR31)
+        FirstReg = LoongArch::VR0;
+      else if (RegID >= LoongArch::F0_64 && RegID <= LoongArch::F31_64)
+        FirstReg = LoongArch::F0_64;
+      else if (RegID >= LoongArch::F0 && RegID <= LoongArch::F31)
+        FirstReg = LoongArch::F0;
+      else
+        return true;
+      OS << '$'
+         << LoongArchInstPrinter::getRegisterName(
+                RegID - FirstReg +
+                (ExtraCode[0] == 'u' ? LoongArch::XR0 : LoongArch::VR0));
+      return false;
+    }
       // TODO: handle other extra codes if any.
     }
   }
diff --git a/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll b/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll
index 201e34c8b5ae0..8b25a6525381b 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll
@@ -12,3 +12,43 @@ entry:
   %0 = tail call <4 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"()
   ret void
 }
+
+define void @test_u_2xi64() nounwind {
+; CHECK-LABEL: test_u_2xi64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    xvldi $xr0, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <2 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"()
+  ret void
+}
+
+define void @test_w_4xi64() nounwind {
+; CHECK-LABEL: test_w_4xi64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vldi $vr0, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <4 x i64> asm sideeffect "vldi ${0:w}, 1", "=f"()
+  ret void
+}
+
+define void @m128i_to_m256i(ptr %out, ptr %in) nounwind {
+; CHECK-LABEL: m128i_to_m256i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    xvrepli.b $xr1, 0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    xvpermi.q $xr1, $xr0, 32
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    xvst $xr1, $a0, 0
+; CHECK-NEXT:    ret
+  %v = load <2 x i64>, ptr %in
+  %x = call <4 x i64> asm sideeffect "xvpermi.q ${0:u}, ${1:u}, 32", "=f,f,0"(<2 x i64> %v, <4 x i64> zeroinitializer)
+  store <4 x i64> %x, ptr %out
+  ret void
+}

From 946780474f3b04568184183d2db28e0480212eb2 Mon Sep 17 00:00:00 2001
From: Paul Osmialowski <pawel.osmialowski@arm.com>
Date: Mon, 3 Mar 2025 18:10:36 +0000
Subject: [PATCH 237/282] [libc++][test] extend XFAIL clauses to cover Amazon
 Linux too (#129377)

The default triple of Amazon Linux on AArch64 is aarch64-amazon-linux,
see issue highlighded by PR #109263, somewhat serious linker issues are
encountered if any other triple is being used.

Unfortunately, this makes XFAIL lines like
`XFAIL: target=aarch64{{.*}}-linux-gnu` ineffective, making it
impossible to complete all of the check-cxx on Amazon Linux without
failing.

(cherry picked from commit 8f4ee42d59976a9343d7576ef9a1fe2cf482a057)
---
 .../iostream.format/std.manip/setfill_wchar_max.pass.cpp         | 1 +
 libcxx/test/std/re/re.alg/re.alg.match/awk.locale.pass.cpp       | 1 +
 libcxx/test/std/re/re.alg/re.alg.match/basic.locale.pass.cpp     | 1 +
 libcxx/test/std/re/re.alg/re.alg.match/ecma.locale.pass.cpp      | 1 +
 libcxx/test/std/re/re.alg/re.alg.match/extended.locale.pass.cpp  | 1 +
 libcxx/test/std/re/re.alg/re.alg.search/awk.locale.pass.cpp      | 1 +
 libcxx/test/std/re/re.alg/re.alg.search/basic.locale.pass.cpp    | 1 +
 libcxx/test/std/re/re.alg/re.alg.search/ecma.locale.pass.cpp     | 1 +
 libcxx/test/std/re/re.alg/re.alg.search/extended.locale.pass.cpp | 1 +
 libcxx/test/std/re/re.traits/lookup_collatename.pass.cpp         | 1 +
 10 files changed, 10 insertions(+)

diff --git a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp
index 9d4126153cc23..b24c0b90d86f1 100644
--- a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp
@@ -17,6 +17,7 @@
 // XFAIL: target={{.*}}-windows{{.*}} && libcpp-abi-version=1
 // XFAIL: target=armv{{7|8}}{{l?}}{{.*}}-linux-gnueabihf && libcpp-abi-version=1
 // XFAIL: target=aarch64{{.*}}-linux-gnu && libcpp-abi-version=1
+// XFAIL: target=aarch64{{.*}}-amazon-linux && libcpp-abi-version=1
 
 #include <iomanip>
 #include <ostream>
diff --git a/libcxx/test/std/re/re.alg/re.alg.match/awk.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.match/awk.locale.pass.cpp
index 57b8c13aa3c14..151c9160b0fee 100644
--- a/libcxx/test/std/re/re.alg/re.alg.match/awk.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.match/awk.locale.pass.cpp
@@ -19,6 +19,7 @@
 // TODO: investigation needed
 // TODO(netbsd): incomplete support for locales
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, netbsd, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 // REQUIRES: locale.cs_CZ.ISO8859-2
 
 #include <regex>
diff --git a/libcxx/test/std/re/re.alg/re.alg.match/basic.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.match/basic.locale.pass.cpp
index 430d35fe739e5..463d0fd3bd1eb 100644
--- a/libcxx/test/std/re/re.alg/re.alg.match/basic.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.match/basic.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.alg/re.alg.match/ecma.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.match/ecma.locale.pass.cpp
index b512fa9b5fcf8..b83bfabfb96be 100644
--- a/libcxx/test/std/re/re.alg/re.alg.match/ecma.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.match/ecma.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.alg/re.alg.match/extended.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.match/extended.locale.pass.cpp
index 472dc19680263..24c38e9f631c9 100644
--- a/libcxx/test/std/re/re.alg/re.alg.match/extended.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.match/extended.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.alg/re.alg.search/awk.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.search/awk.locale.pass.cpp
index 9125df404b1de..3a0229ec3be2f 100644
--- a/libcxx/test/std/re/re.alg/re.alg.search/awk.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.search/awk.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.alg/re.alg.search/basic.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.search/basic.locale.pass.cpp
index f85b6a40ce129..eba6b3383059a 100644
--- a/libcxx/test/std/re/re.alg/re.alg.search/basic.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.search/basic.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.alg/re.alg.search/ecma.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.search/ecma.locale.pass.cpp
index aa9441cb3e58f..175a05a9ef7c4 100644
--- a/libcxx/test/std/re/re.alg/re.alg.search/ecma.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.search/ecma.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.alg/re.alg.search/extended.locale.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.search/extended.locale.pass.cpp
index 9746e45f29da5..e7dbccd6f0ba4 100644
--- a/libcxx/test/std/re/re.alg/re.alg.search/extended.locale.pass.cpp
+++ b/libcxx/test/std/re/re.alg/re.alg.search/extended.locale.pass.cpp
@@ -23,6 +23,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}, freebsd
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <cassert>
diff --git a/libcxx/test/std/re/re.traits/lookup_collatename.pass.cpp b/libcxx/test/std/re/re.traits/lookup_collatename.pass.cpp
index 178979d5b9ce8..803d062693c33 100644
--- a/libcxx/test/std/re/re.traits/lookup_collatename.pass.cpp
+++ b/libcxx/test/std/re/re.traits/lookup_collatename.pass.cpp
@@ -24,6 +24,7 @@
 
 // TODO: investigation needed
 // XFAIL: target={{.*}}-linux-gnu{{.*}}
+// XFAIL: target={{.*}}-amazon-linux{{.*}}
 
 #include <regex>
 #include <iterator>

From 7c154dad4d1538f80bac3c6da7a4b74e1f25e2b9 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Mon, 3 Mar 2025 14:26:43 -0600
Subject: [PATCH 238/282] [Clang] Fix GPU intrinsic helpers incorrectly sign
 extending (#129560)

Summary:
These return values are actually signed, meaning that casting will
extend it and then all the bits will be one.

(cherry picked from commit 4ca8ea8c972ae05a891687eda6704ec607184fae)
---
 clang/lib/Headers/amdgpuintrin.h | 2 +-
 clang/lib/Headers/nvptxintrin.h  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 9dad99ffe9439..d12c7e244c2be 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -121,7 +121,7 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
-         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 0afcb1c5ff0f0..ea21359840ed8 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -127,7 +127,8 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
                                              __gpu_num_lanes() - 1)
           << 32ull) |
          ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
-                                             __gpu_num_lanes() - 1));
+                                             __gpu_num_lanes() - 1) &
+          0xFFFFFFFF);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.

From 548d057ebcfa21eb9445a4bd77914ec875dbf66a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 1 Mar 2025 16:54:22 +0000
Subject: [PATCH 239/282] [VectorCombine] scalarizeLoadExtract - don't create
 scalar loads if any extract is waiting to be erased (#129375)

If any extract is waiting to be erased, then bail out as this will distort the cost calculation and possibly lead to infinite loops.

Fixes #129373

(cherry picked from commit 5ddf40fa78705384966c22da78e12134df7bd723)
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp      |  5 +++++
 .../X86/load-extractelement-scalarization.ll         | 12 ++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 59920b5a4dd20..1deaaca05d98f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1403,6 +1403,11 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
     if (!UI || UI->getParent() != LI->getParent())
       return false;
 
+    // If any extract is waiting to be erased, then bail out as this will
+    // distort the cost calculation and possibly lead to infinite loops.
+    if (UI->use_empty())
+      return false;
+
     // Check if any instruction between the load and the extract may modify
     // memory.
     if (LastCheckedInst->comesBefore(UI)) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
index 0acfeccb92ef7..d46c8c0de4037 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
@@ -24,3 +24,15 @@ define void @multiple_extract(ptr %p) {
   store i32 %e1, ptr %p1, align 4
   ret void
 }
+
+; infinite loop if we fold an extract that is waiting to be erased
+define void @unused_extract(ptr %p) {
+; CHECK-LABEL: @unused_extract(
+; CHECK-NEXT:    ret void
+;
+  %load = load <4 x float>, ptr %p, align 8
+  %shuffle0 = shufflevector <4 x float> zeroinitializer, <4 x float> %load, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %shuffle1 = shufflevector <4 x float> %shuffle0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+  %extract = extractelement <4 x float> %load, i64 1
+  ret void
+}

From 64ae6413559e2f0fa9218b2f83919ec757404f3b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Wed, 12 Mar 2025 11:33:12 -0600
Subject: [PATCH 240/282] [SystemZ]  Move disabling of arg verification to
 before isFullyInternal(). (#130693)

It has found to be quite a slowdown to traverse the users of a
function from each call site when it is called many (~70k)
times. This patch fixes this for now as long as this verification
is disabled by default, but there is still a need to eventually
cache the results to avoid recomputation.

Fixes #130541

(cherry picked from commit 378739f18208165f9831571a57f34d82f6663bc6)
---
 llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 1fb31c26e20d3..2b8269e440e90 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -10231,6 +10231,11 @@ static void printFunctionArgExts(const Function *F, raw_fd_ostream &OS) {
 void SystemZTargetLowering::
 verifyNarrowIntegerArgs_Call(const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const Function *F, SDValue Callee) const {
+  // Temporarily only do the check when explicitly requested, until it can be
+  // enabled by default.
+  if (!EnableIntArgExtCheck)
+    return;
+
   bool IsInternal = false;
   const Function *CalleeFn = nullptr;
   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
@@ -10252,6 +10257,11 @@ verifyNarrowIntegerArgs_Call(const SmallVectorImpl<ISD::OutputArg> &Outs,
 void SystemZTargetLowering::
 verifyNarrowIntegerArgs_Ret(const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const Function *F) const {
+  // Temporarily only do the check when explicitly requested, until it can be
+  // enabled by default.
+  if (!EnableIntArgExtCheck)
+    return;
+
   if (!verifyNarrowIntegerArgs(Outs, isFullyInternal(F))) {
     errs() << "ERROR: Missing extension attribute of returned "
            << "value from function:\n";
@@ -10268,11 +10278,6 @@ verifyNarrowIntegerArgs(const SmallVectorImpl<ISD::OutputArg> &Outs,
   if (IsInternal || !Subtarget.isTargetELF())
     return true;
 
-  // Temporarily only do the check when explicitly requested, until it can be
-  // enabled by default.
-  if (!EnableIntArgExtCheck)
-    return true;
-
   if (EnableIntArgExtCheck.getNumOccurrences()) {
     if (!EnableIntArgExtCheck)
       return true;

From b09b05b8e7c392428baaaf6bcaa8c85ce798ef72 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 12 Mar 2025 14:52:01 +0100
Subject: [PATCH 241/282] [MemCpyOpt] Fix clobber check in fca2memcpy
 optimization

This effectively reverts #108535. The old AA code was looking for
the *first* clobber between the load and store and then trying to
move all the way up there. The new MSSA based code instead found
the *last* clobber. There might still be an earlier clobber that
has not been accounted for.

Fixes #130632.

(cherry picked from commit 5da9044c40840187330526ca888290a95927a629)
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 24 ++++++-----
 llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll  | 40 +++++++++++++++----
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a80a85f38e74d..971d6012f6129 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -638,17 +638,19 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
       (EnableMemCpyOptWithoutLibcalls ||
        (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) {
     MemoryLocation LoadLoc = MemoryLocation::get(LI);
-    MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI),
-                   *StoreAccess = MSSA->getMemoryAccess(SI);
-
-    // We use MSSA to check if an instruction may store to the memory we load
-    // from in between the load and the store. If such an instruction is found,
-    // we try to promote there instead of at the store position.
-    auto *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
-        StoreAccess->getDefiningAccess(), LoadLoc, BAA);
-    Instruction *P = MSSA->dominates(LoadAccess, Clobber)
-                         ? cast<MemoryUseOrDef>(Clobber)->getMemoryInst()
-                         : SI;
+
+    // We use alias analysis to check if an instruction may store to
+    // the memory we load from in between the load and the store. If
+    // such an instruction is found, we try to promote there instead
+    // of at the store position.
+    // TODO: Can use MSSA for this.
+    Instruction *P = SI;
+    for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
+      if (isModSet(BAA.getModRefInfo(&I, LoadLoc))) {
+        P = &I;
+        break;
+      }
+    }
 
     // If we found an instruction that may write to the loaded memory,
     // we can try to promote at this position instead of the store
diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
index 61e349e01ed91..7d4557aa331c4 100644
--- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
@@ -51,8 +51,8 @@ define void @destroysrc(ptr %src, ptr %dst) {
 
 define void @destroynoaliassrc(ptr noalias %src, ptr %dst) {
 ; CHECK-LABEL: @destroynoaliassrc(
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false)
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %1 = load %S, ptr %src
@@ -79,9 +79,9 @@ define void @copyalias(ptr %src, ptr %dst) {
 ; sure we lift the computation as well if needed and possible.
 define void @addrproducer(ptr %src, ptr %dst) {
 ; CHECK-LABEL: @addrproducer(
-; CHECK-NEXT:    [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST]], i64 1
+; CHECK-NEXT:    [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST:%.*]], i64 1
 ; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false)
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[DST]], i8 undef, i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %1 = load %S, ptr %src
@@ -113,8 +113,8 @@ define void @noaliasaddrproducer(ptr %src, ptr noalias %dst, ptr noalias %dstidp
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4
 ; CHECK-NEXT:    [[DSTINDEX:%.*]] = or i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST:%.*]], i32 [[DSTINDEX]]
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC]], i64 16, i1 false)
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 undef, i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %1 = load %S, ptr %src
@@ -130,7 +130,7 @@ define void @throwing_call(ptr noalias %src, ptr %dst) {
 ; CHECK-LABEL: @throwing_call(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false)
-; CHECK-NEXT:    call void @call() [[ATTR2:#.*]]
+; CHECK-NEXT:    call void @call() #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    store [[S]] [[TMP1]], ptr [[DST:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -156,4 +156,30 @@ loop:
   br label %loop
 }
 
+; There are multiple instructions that can clobber the source memory here.
+; We can move the dest write past the store to %ptr.24, but not the memcpy.
+; Make sure we don't perform fca2memcpy conversion in this case.
+define void @multiple_clobbering(ptr %ptr, ptr %ptr.copy) {
+; CHECK-LABEL: @multiple_clobbering(
+; CHECK-NEXT:    [[PTR_8:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 8
+; CHECK-NEXT:    [[PTR_24:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 24
+; CHECK-NEXT:    [[PTR_32:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 32
+; CHECK-NEXT:    [[PTR_COPY_8:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_COPY:%.*]], i64 8
+; CHECK-NEXT:    [[STRUCT:%.*]] = load { i32, i64 }, ptr [[PTR_COPY_8]], align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[PTR_8]], ptr [[PTR_32]], i64 12, i1 false)
+; CHECK-NEXT:    store i64 1, ptr [[PTR_24]], align 8
+; CHECK-NEXT:    store { i32, i64 } [[STRUCT]], ptr [[PTR_32]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr.8 = getelementptr inbounds nuw i8, ptr %ptr, i64 8
+  %ptr.24 = getelementptr inbounds nuw i8, ptr %ptr, i64 24
+  %ptr.32 = getelementptr inbounds nuw i8, ptr %ptr, i64 32
+  %ptr.copy.8 = getelementptr inbounds nuw i8, ptr %ptr.copy, i64 8
+  %struct = load { i32, i64 }, ptr %ptr.copy.8, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %ptr.8, ptr %ptr.32, i64 12, i1 false)
+  store i64 1, ptr %ptr.24, align 8
+  store { i32, i64 } %struct, ptr %ptr.32, align 8
+  ret void
+}
+
 declare void @call()

From 63e63f3061287d2ce45dac05169616b16942a9c4 Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Wed, 26 Feb 2025 16:54:19 +0800
Subject: [PATCH 242/282] [Clang] Fix an integer overflow issue in computing
 CTAD's parameter depth (#128704)

Backports b8d1f3d62.

This fixes a potential integer overflow bug that has been around for
many versions and was exposed by my patch recently. So we think it
warrants a backport
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/lib/Sema/SemaTemplateDeductionGuide.cpp |  9 +++++-
 clang/test/SemaTemplate/deduction-guide.cpp   | 32 +++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 57a567509a068..02292c10e6964 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1058,6 +1058,7 @@ Bug Fixes to C++ Support
 - Fixed a substitution bug in transforming CTAD aliases when the type alias contains a non-pack template argument
   corresponding to a pack parameter (#GH124715)
 - Clang is now better at keeping track of friend function template instance contexts. (#GH55509)
+- Fixed an integer overflow bug in computing template parameter depths when synthesizing CTAD guides. (#GH128691)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index 00c5dfd3d7a43..b424de9c8a945 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -377,8 +377,15 @@ struct ConvertConstructorToDeductionGuideTransform {
         if (NestedPattern)
           Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth());
         auto [Depth, Index] = getDepthAndIndex(Param);
+        // Depth can still be 0 if FTD belongs to an explicit class template
+        // specialization with an empty template parameter list. In that case,
+        // we don't want the NewDepth to overflow, and it should remain 0.
+        assert(Depth ||
+               cast<ClassTemplateSpecializationDecl>(FTD->getDeclContext())
+                   ->isExplicitSpecialization());
         NamedDecl *NewParam = transformTemplateParameter(
-            SemaRef, DC, Param, Args, Index + Depth1IndexAdjustment, Depth - 1);
+            SemaRef, DC, Param, Args, Index + Depth1IndexAdjustment,
+            Depth ? Depth - 1 : 0);
         if (!NewParam)
           return nullptr;
         // Constraints require that we substitute depth-1 arguments
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index a4c523595fca2..ecd152abebd74 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -691,3 +691,35 @@ Test test(42);
 // CHECK-NEXT: | `-ParmVarDecl {{.*}} 'auto:1'
 
 } // namespace GH122134
+
+namespace GH128691 {
+
+template <typename = void>
+class NewDeleteAllocator;
+
+template <>
+struct NewDeleteAllocator<> {
+  template <typename T>
+  NewDeleteAllocator(T); // expected-note {{candidate template ignored}} \
+                         // expected-note {{implicit deduction guide declared as}}
+};
+
+template <typename>
+struct NewDeleteAllocator : NewDeleteAllocator<> { // expected-note {{candidate template ignored}} \
+                                                   // expected-note {{implicit deduction guide declared as}}
+  using NewDeleteAllocator<>::NewDeleteAllocator;
+};
+
+void test() { NewDeleteAllocator abc(42); } // expected-error {{no viable constructor or deduction guide}}
+
+// CHECK-LABEL: Dumping GH128691::<deduction guide for NewDeleteAllocator>:
+// CHECK-NEXT: FunctionTemplateDecl {{.+}} <deduction guide for NewDeleteAllocator>
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} typename depth 0 index 0
+// CHECK-NEXT: | `-TemplateArgument type 'void'
+// CHECK-NEXT: |   |-inherited from TemplateTypeParm {{.+}} depth 0 index 0
+// CHECK-NEXT: |   `-BuiltinType {{.+}} 'void'
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} typename depth 0 index 1 T
+// CHECK-NEXT: `-CXXDeductionGuideDecl {{.+}} <deduction guide for NewDeleteAllocator> 'auto (T) -> NewDeleteAllocator<type-parameter-0-0>'
+// CHECK-NEXT:  `-ParmVarDecl {{.+}} 'T'
+
+} // namespace GH128691

From cb50aaf8a11b89f2785641fba5ffd4b67f566a32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez=20Troiti=C3=B1o?=
 <drodriguez@users.noreply.github.com>
Date: Fri, 14 Mar 2025 13:43:48 -0700
Subject: [PATCH 243/282] [llvm-objcopy] Apply encryptable offset to first
 segment, not section (#130517)

Bug introduced #120995. The LLD code calculates the "size" of the Mach-O
headers, and then uses that size to place the segments, but the `__TEXT`
section stays at `fileoff` zero. When I wrote the code into llvm-objcopy
I calculated the extra space into the initial offset, which moved all
the sections back 1 page.

Besides the modified test checking for the right `fileoff` values of the
sections and the segments, I also manually checked the generated
binaries after `llvm-objcopy` using `dyld_info`, as the bug report
suggested.

Fixes #130472

(cherry picked from commit 8413f4d837a96458104f63bab72c751b8285a458)
---
 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp       | 13 ++++++++-----
 .../MachO/strip-with-encryption-info.test           | 12 +++++++++---
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
index d4eb6a9b9fc0b..8ecd669e67178 100644
--- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
@@ -116,11 +116,10 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
   const bool IsObjectFile =
       O.Header.FileType == MachO::HeaderFileType::MH_OBJECT;
   uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0;
-  if (O.EncryptionInfoCommandIndex) {
-    // If we are emitting an encryptable binary, our load commands must have a
-    // separate (non-encrypted) page to themselves.
-    Offset = alignToPowerOf2(HeaderSize + O.Header.SizeOfCmds, PageSize);
-  }
+  // If we are emitting an encryptable binary, our load commands must have a
+  // separate (non-encrypted) page to themselves.
+  bool RequiresFirstSectionOutsideFirstPage =
+      O.EncryptionInfoCommandIndex.has_value();
   for (LoadCommand &LC : O.LoadCommands) {
     auto &MLC = LC.MachOLoadCommand;
     StringRef Segname;
@@ -174,6 +173,10 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
         if (!Sec->hasValidOffset()) {
           Sec->Offset = 0;
         } else {
+          if (RequiresFirstSectionOutsideFirstPage) {
+            SectOffset = alignToPowerOf2(SectOffset, PageSize);
+            RequiresFirstSectionOutsideFirstPage = false;
+          }
           Sec->Offset = SegOffset + SectOffset;
           Sec->Size = Sec->Content.size();
           SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size);
diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
index 19b06b1ec02c8..2b2bd670613de 100644
--- a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
+++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
@@ -1,13 +1,19 @@
 # RUN: rm -rf %t && mkdir %t
 # RUN: yaml2obj %s -o %t/original
 # RUN: llvm-strip --strip-all %t/original -o %t/stripped
-# RUN: llvm-readobj --macho-segment %t/stripped | FileCheck %s
+# RUN: llvm-readobj --macho-segment --section-headers %t/stripped | FileCheck %s
+
+# CHECK-LABEL: Sections [
+# CHECK:      Index: 0
+# CHECK-NEXT: Name: __text
+# CHECK-NEXT: Segment: __TEXT
+# CHECK:      Offset: 16384
 
 # CHECK-LABEL: Name: __PAGEZERO
-# CHECK:       fileoff: 16384
+# CHECK:       fileoff: 0
 
 # CHECK-LABEL: Name: __TEXT
-# CHECK:       fileoff: 16384
+# CHECK:       fileoff: 0
 
 # The YAML below is the following code
 # int main(int argc, char **argv) { return 0; }

From 073ae08864b40310448289c6f40b95ca4f330444 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 18 Mar 2025 02:50:18 +0700
Subject: [PATCH 244/282]  AMDGPU: Fix broken broken negative test for gfx950
 assembler (#129667) (#129686)

Fix's not rejecting global_load_lds_dwordx3 and x4 on other targets.
The encoded versions of instructions should not touch
SubtargetPredicate,
and only AssemblerPredicate.

(cherry picked from commit f319a6546613d65661e1ad1ef1a2a648cefee84b)
---
 llvm/lib/Target/AMDGPU/FLATInstructions.td | 2 +-
 llvm/test/MC/AMDGPU/gfx950_asm_features.s  | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8fa708b74dde3..dbd5723142d20 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1873,7 +1873,7 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
     }
   }
 
-  let SubtargetPredicate = isGFX940Plus in {
+  let AssemblerPredicate = isGFX940Plus in {
     def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
     def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
   }
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
index 389b17296c045..7bc47914f40b7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
@@ -1,11 +1,10 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 --strict-whitespace %s
-// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX940 --implicit-check-not=error: %s
-// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX90A --implicit-check-not=error: %s
-// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX10 --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950 --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950 --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950 --implicit-check-not=error: %s
 
 // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 // GFX950: global_load_lds_dwordx3 v[2:3], off     ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
-
 global_load_lds_dwordx3 v[2:3], off
 
 // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:

From 0b23d98dceaa9f965bfa196a6adfa38b1b8bda8e Mon Sep 17 00:00:00 2001
From: higher-performance <higher.performance.github@gmail.com>
Date: Thu, 13 Mar 2025 16:02:39 -0400
Subject: [PATCH 245/282] Reduce memory usage in AST parent map generation by
 lazily checking if nodes have been seen (#129934)

This mitigates a regression introduced in #87824.

The mitigation here is to store pointers the deduplicated AST nodes, rather than copies of the nodes themselves. This allows a pointer-optimized set to be used and saves a lot of memory because `clang::DynTypedNode` is ~5 times larger than a pointer.

Fixes #129808.

(cherry picked from commit 8c7f0eaa6ee3f84e3d8260535cced234bed4fa28)
---
 clang/lib/AST/ParentMapContext.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ParentMapContext.cpp b/clang/lib/AST/ParentMapContext.cpp
index 7ff492443031d..d8dd352c42d6b 100644
--- a/clang/lib/AST/ParentMapContext.cpp
+++ b/clang/lib/AST/ParentMapContext.cpp
@@ -12,10 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/ParentMapContext.h"
-#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/TemplateBase.h"
+#include "llvm/ADT/SmallPtrSet.h"
 
 using namespace clang;
 
@@ -69,17 +70,21 @@ class ParentMapContext::ParentMap {
       for (; N > 0; --N)
         push_back(Value);
     }
-    bool contains(const DynTypedNode &Value) {
-      return Seen.contains(Value);
+    bool contains(const DynTypedNode &Value) const {
+      const void *Identity = Value.getMemoizationData();
+      assert(Identity);
+      return Dedup.contains(Identity);
     }
     void push_back(const DynTypedNode &Value) {
-      if (!Value.getMemoizationData() || Seen.insert(Value).second)
+      const void *Identity = Value.getMemoizationData();
+      if (!Identity || Dedup.insert(Identity).second) {
         Items.push_back(Value);
+      }
     }
     llvm::ArrayRef<DynTypedNode> view() const { return Items; }
   private:
-    llvm::SmallVector<DynTypedNode, 2> Items;
-    llvm::SmallDenseSet<DynTypedNode, 2> Seen;
+    llvm::SmallVector<DynTypedNode, 1> Items;
+    llvm::SmallPtrSet<const void *, 2> Dedup;
   };
 
   /// Maps from a node to its parents. This is used for nodes that have

From 0fcfeacd8b996e47353e24bfaf78ae86cc4090fc Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Tue, 11 Mar 2025 11:23:53 -0700
Subject: [PATCH 246/282] [BPF] Fix BitCast Assertion with NonZero AddrSpace

Alexei reported a bpf selftest failure with recent llvm for bpf prog
file progs/arena_spin_lock.c. The failure only happens when clang is
built with cmake option LLVM_ENABLE_ASSERTIONS=ON.

The error message looks like:
```
 clang: /home/yhs/work/yhs/llvm-project/llvm/lib/IR/Instructions.cpp:3460:
   llvm::BitCastInst::BitCastInst(Value *, Type *, const Twine &, InsertPosition):
   Assertion `castIsValid(getOpcode(), S, Ty) && "Illegal BitCast"' failed.
```
Further investigation shows that the problem is triggered in
  BPF/BPFAbstractMemberAccess.cpp
for code
```
  auto *BCInst =
      new BitCastInst(Base, PointerType::getUnqual(BB->getContext()));
```
For the above BitCastInst, Since 'Base' has non-zero AddrSapce, the
compiler expects the type also has the same AddrSpace. But the above
PointerType::getUnqual(...) does not have AddrSpace and hence causes the
assertion failure.

Providing the proper AddrSpace for the BitCast type fixed the issue.

Co-authored-by: Yonghong Song <yonghong.song@linux.dev>
(cherry picked from commit 5686786c550c6da6d1169b9bffc31cece1161902)
---
 .../Target/BPF/BPFAbstractMemberAccess.cpp    |  5 +-
 llvm/test/CodeGen/BPF/CORE/arena_bitcast.ll   | 80 +++++++++++++++++++
 2 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/CORE/arena_bitcast.ll

diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index 646d57770164a..77ed246edbadf 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -1113,8 +1113,9 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
                               Call->getIterator());
 
   // Generate a BitCast
-  auto *BCInst =
-      new BitCastInst(Base, PointerType::getUnqual(BB->getContext()));
+  auto *BCInst = new BitCastInst(
+      Base, PointerType::get(BB->getContext(),
+                             Base->getType()->getPointerAddressSpace()));
   BCInst->insertBefore(Call->getIterator());
 
   // Generate a GetElementPtr
diff --git a/llvm/test/CodeGen/BPF/CORE/arena_bitcast.ll b/llvm/test/CodeGen/BPF/CORE/arena_bitcast.ll
new file mode 100644
index 0000000000000..bcd71c04d264d
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/CORE/arena_bitcast.ll
@@ -0,0 +1,80 @@
+; RUN: opt -O2 %s | llvm-dis > %t1
+; RUN: llc -mcpu=v3 -filetype=asm -o - %t1 | FileCheck %s
+; Source code:
+;   struct lock_t {
+;     int counter;
+;   } __attribute__((preserve_access_index));
+;
+;   #define __arena __attribute__((address_space(1)))
+;   int test(struct lock_t __arena *lock, unsigned val)
+;   {
+;     return __sync_val_compare_and_swap((&lock->counter), val, 1);
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm -Xclang -disable-llvm-passes arena_bitcast.c
+
+target triple = "bpf"
+
+%struct.lock_t = type { i32 }
+
+; Function Attrs: nounwind
+define dso_local i32 @test(ptr addrspace(1) noundef %lock, i32 noundef %val) #0 !dbg !7 {
+entry:
+  %lock.addr = alloca ptr addrspace(1), align 8
+  %val.addr = alloca i32, align 4
+  store ptr addrspace(1) %lock, ptr %lock.addr, align 8, !tbaa !19
+    #dbg_declare(ptr %lock.addr, !17, !DIExpression(), !24)
+  store i32 %val, ptr %val.addr, align 4, !tbaa !25
+    #dbg_declare(ptr %val.addr, !18, !DIExpression(), !27)
+  %0 = load ptr addrspace(1), ptr %lock.addr, align 8, !dbg !28, !tbaa !19
+  %1 = call ptr addrspace(1) @llvm.preserve.struct.access.index.p1.p1(ptr addrspace(1) elementtype(%struct.lock_t) %0, i32 0, i32 0), !dbg !29, !llvm.preserve.access.index !12
+  %2 = load i32, ptr %val.addr, align 4, !dbg !30, !tbaa !25
+  %3 = cmpxchg ptr addrspace(1) %1, i32 %2, i32 1 seq_cst seq_cst, align 4, !dbg !31
+  %4 = extractvalue { i32, i1 } %3, 0, !dbg !31
+  ret i32 %4, !dbg !32
+}
+; CHECK:  r1 = addr_space_cast(r1, 0, 1)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare ptr addrspace(1) @llvm.preserve.struct.access.index.p1.p1(ptr addrspace(1), i32 immarg, i32 immarg) #1
+
+attributes #0 = { nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 21.0.0git (https://github.com/llvm/llvm-project.git 7d4d8509cbec7eecd8aaf2510015b54bc5c173e1)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "arena_bitcast.c", directory: "/root/home/yhs/tests/arena/simple", checksumkind: CSK_MD5, checksum: "51cb51c1fc09d3033dbd9aee9044dc9b")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"frame-pointer", i32 2}
+!6 = !{!"clang version 21.0.0git (https://github.com/llvm/llvm-project.git 7d4d8509cbec7eecd8aaf2510015b54bc5c173e1)"}
+!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 6, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !11, !15}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "lock_t", file: !1, line: 1, size: 32, elements: !13)
+!13 = !{!14}
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "counter", scope: !12, file: !1, line: 2, baseType: !10, size: 32)
+!15 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!16 = !{!17, !18}
+!17 = !DILocalVariable(name: "lock", arg: 1, scope: !7, file: !1, line: 6, type: !11)
+!18 = !DILocalVariable(name: "val", arg: 2, scope: !7, file: !1, line: 6, type: !15)
+!19 = !{!20, !20, i64 0}
+!20 = !{!"p1 _ZTS6lock_t", !21, i64 0}
+!21 = !{!"any pointer", !22, i64 0}
+!22 = !{!"omnipotent char", !23, i64 0}
+!23 = !{!"Simple C/C++ TBAA"}
+!24 = !DILocation(line: 6, column: 33, scope: !7)
+!25 = !{!26, !26, i64 0}
+!26 = !{!"int", !22, i64 0}
+!27 = !DILocation(line: 6, column: 48, scope: !7)
+!28 = !DILocation(line: 8, column: 40, scope: !7)
+!29 = !DILocation(line: 8, column: 46, scope: !7)
+!30 = !DILocation(line: 8, column: 56, scope: !7)
+!31 = !DILocation(line: 8, column: 10, scope: !7)
+!32 = !DILocation(line: 8, column: 3, scope: !7)

From 1cfbb9f334360fc836966a79eba6d00c8d3d22c7 Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Mon, 17 Mar 2025 16:53:57 +0800
Subject: [PATCH 247/282] Backport/20.x: [Clang] Fix an incorrect assumption on
 getTemplatedDecl()

This backports d9110858ee because it fixes a regression introduced in 19
and we don't want it to persist in 20
---
 clang/docs/ReleaseNotes.rst                        |  1 +
 clang/lib/Sema/SemaAccess.cpp                      |  4 ++--
 clang/test/SemaCXX/concept-crash-on-diagnostic.cpp | 12 ++++++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 02292c10e6964..dc63b5213c546 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1059,6 +1059,7 @@ Bug Fixes to C++ Support
   corresponding to a pack parameter (#GH124715)
 - Clang is now better at keeping track of friend function template instance contexts. (#GH55509)
 - Fixed an integer overflow bug in computing template parameter depths when synthesizing CTAD guides. (#GH128691)
+- Fixed an incorrect pointer access when checking access-control on concepts. (#GH131530)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaAccess.cpp b/clang/lib/Sema/SemaAccess.cpp
index f79d9a758e7af..6813786df3fc4 100644
--- a/clang/lib/Sema/SemaAccess.cpp
+++ b/clang/lib/Sema/SemaAccess.cpp
@@ -1518,8 +1518,8 @@ void Sema::HandleDelayedAccessCheck(DelayedDiagnostic &DD, Decl *D) {
   } else if (FunctionDecl *FN = dyn_cast<FunctionDecl>(D)) {
     DC = FN;
   } else if (TemplateDecl *TD = dyn_cast<TemplateDecl>(D)) {
-    if (isa<DeclContext>(TD->getTemplatedDecl()))
-      DC = cast<DeclContext>(TD->getTemplatedDecl());
+    if (auto *D = dyn_cast_if_present<DeclContext>(TD->getTemplatedDecl()))
+      DC = D;
   } else if (auto *RD = dyn_cast<RequiresExprBodyDecl>(D)) {
     DC = RD;
   }
diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
index 71e55c8290ee4..c38f8888075de 100644
--- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
+++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
@@ -36,3 +36,15 @@ void function() {
 // expected-note@#4 {{candidate template ignored: constraints not satisfied [with IteratorL = Object *, IteratorR = Object *]}}
 // We don't know exactly the substituted type for `lhs == rhs`, thus a placeholder 'expr-type' is emitted.
 // expected-note@#3 {{because 'convertible_to<expr-type, bool>' would be invalid}}
+
+namespace GH131530 {
+
+class foo {
+  struct bar {}; // expected-note {{implicitly declared private}}
+};
+
+template <typename T>
+concept is_foo_concept = __is_same(foo::bar, T);
+// expected-error@-1 {{'bar' is a private member of 'GH131530::foo'}}
+
+}

From 1515c4ac202ddb684523e86f2e14c231c23e3d82 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 28 Feb 2025 20:56:12 +0000
Subject: [PATCH 248/282] [LAA] Consider accessed addrspace when mapping
 underlying obj to access. (#129087)

In some cases, it is possible for the same underlying object to be
accessed via pointers to different address spaces. This could lead to
pointers from different address spaces ending up in the same dependency
set, which isn't allowed (and triggers an assertion).

Update the mapping from underlying object -> last access to also include
the accessing address space.

Fixes https://github.com/llvm/llvm-project/issues/124759.

PR: https://github.com/llvm/llvm-project/pull/129087
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 12 ++++--
 ...erlying-object-different-address-spaces.ll | 39 +++++++++++++++++++
 2 files changed, 47 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/underlying-object-different-address-spaces.ll

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 697b40403902c..dcd8a910d0f49 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1283,8 +1283,10 @@ void AccessAnalysis::processMemAccesses() {
 
     bool SetHasWrite = false;
 
-    // Map of pointers to last access encountered.
-    typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+    // Map of (pointer to underlying objects, accessed address space) to last
+    // access encountered.
+    typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>
+        UnderlyingObjToAccessMap;
     UnderlyingObjToAccessMap ObjToLastAccess;
 
     // Set of access to check after all writes have been processed.
@@ -1364,12 +1366,14 @@ void AccessAnalysis::processMemAccesses() {
                     UnderlyingObj->getType()->getPointerAddressSpace()))
               continue;
 
+            unsigned AccessAS = cast<PointerType>(Ptr->getType())->getAddressSpace();
             UnderlyingObjToAccessMap::iterator Prev =
-                ObjToLastAccess.find(UnderlyingObj);
+                ObjToLastAccess.find({UnderlyingObj,AccessAS 
+                 });
             if (Prev != ObjToLastAccess.end())
               DepCands.unionSets(Access, Prev->second);
 
-            ObjToLastAccess[UnderlyingObj] = Access;
+            ObjToLastAccess[{UnderlyingObj, AccessAS}] = Access;
             LLVM_DEBUG(dbgs() << "  " << *UnderlyingObj << "\n");
           }
         }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-different-address-spaces.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-different-address-spaces.ll
new file mode 100644
index 0000000000000..adf73c091be00
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-different-address-spaces.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+; Test case for https://github.com/llvm/llvm-project/issues/124759. The same
+; underlying object is access through pointers with different address spaces.
+define void @same_underlying_object_different_address_spaces(ptr %dst1.as1, ptr %dst2.as1) {
+; CHECK-LABEL: 'same_underlying_object_different_address_spaces'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %alloc = alloca i8, i64 0, align 128
+  %as3 = addrspacecast ptr %alloc to ptr addrspace(3)
+  %as4 = addrspacecast ptr %alloc to ptr addrspace(4)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  store i32 0, ptr addrspace(4) %as4, align 4
+  store i32 0, ptr %dst1.as1, align 4
+  %l = load i64, ptr addrspace(3) %as3, align 4
+  store i64 %l, ptr %dst2.as1, align 4
+  %iv.next = add i64 %iv, 1
+  %c = icmp eq i64 %iv.next, 100
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret void
+}

From 1f9d00524b8cd20d3c4962be15320e6861b455b4 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Thu, 30 Jan 2025 12:46:24 -0500
Subject: [PATCH 249/282] [libc++] Forward-proof some tests for AppleClang 17

---
 .../new.delete/new.delete.array/sized_delete_array.pass.cpp     | 1 +
 .../new.delete/new.delete.single/sized_delete.pass.cpp          | 1 +
 libcxx/test/std/numerics/c.math/signbit.pass.cpp                | 2 +-
 .../meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp    | 2 +-
 .../meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp  | 2 +-
 5 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp
index 1d763d6caba6a..01387feed67b6 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp
@@ -14,6 +14,7 @@
 // ADDITIONAL_COMPILE_FLAGS(clang-18): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(apple-clang-15): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation
+// ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp
index 462037e53374b..06d3b0e5b3c35 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp
@@ -14,6 +14,7 @@
 // ADDITIONAL_COMPILE_FLAGS(clang-18): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(apple-clang-15): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation
+// ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation
 // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation
 
diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp
index 143baf1fec941..b5e63dedf136e 100644
--- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp
@@ -12,7 +12,7 @@
 // UNSUPPORTED: windows
 
 // These compilers don't support constexpr `__builtin_signbit` yet.
-// UNSUPPORTED: clang-18, clang-19, apple-clang-15, apple-clang-16
+// UNSUPPORTED: clang-18, clang-19, apple-clang-15, apple-clang-16, apple-clang-17
 
 // XFAIL: FROZEN-CXX03-HEADERS-FIXME
 
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
index 24adec37431e7..681ad13a07dfd 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // These compilers don't support __builtin_is_implicit_lifetime yet.
-// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16
+// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17
 
 // <type_traits>
 
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp
index 4bcb10d0b7579..34462f9bf0ec6 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // These compilers don't support __builtin_is_implicit_lifetime yet.
-// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16
+// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17
 
 // <type_traits>
 

From 1058e693f090bdb71505ca8c8efc6cb080509dce Mon Sep 17 00:00:00 2001
From: "A. Jiang" <de34@live.cn>
Date: Sat, 15 Mar 2025 21:07:53 +0800
Subject: [PATCH 250/282] [libc++][test] Skip a `is_virtual_base_of` test for
 apple-clang-17 (#131438)

It seems that Apple Clang 17 starts to be used for CI, while it hasn't
supported `__builtin_is_virtual_base_of` yet. And thus we need to skip
the test for `is_virtual_base_of`.

Follows up #131302.
---
 .../std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
index bcffa5812d04e..f443d2030961d 100644
--- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
 
 // These compilers don't support __builtin_is_virtual_base_of yet.
-// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-16
+// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-16, apple-clang-17
 
 // <type_traits>
 

From 0ceb4efefeaa650166254ec8f1836a2ee76be207 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Fri, 28 Feb 2025 09:58:19 -0500
Subject: [PATCH 251/282] [CUDA][HIP] fix virtual dtor host/device attr
 (#128926)

When inferring host device attr of virtual dtor of explicit
template class instantiation, clang should be conservative.
This guarantees dtors that may call host functions not to
have implicit device attr, therefore will not be emitted
on device side.

Backports: 0f0665db067f d37a39207bc1

Fixes: #108548
---
 clang/docs/HIPSupport.rst       |  20 ++++++
 clang/include/clang/Sema/Sema.h |   2 +-
 clang/lib/Sema/Sema.cpp         |  43 +++++++++++++
 clang/lib/Sema/SemaCUDA.cpp     |  23 ++++++-
 clang/lib/Sema/SemaDecl.cpp     |  15 +++++
 clang/test/SemaCUDA/dtor.cu     | 104 ++++++++++++++++++++++++++++++++
 6 files changed, 204 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/SemaCUDA/dtor.cu

diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index 481ed39230813..8f473c21e1918 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -286,6 +286,26 @@ Example Usage
       basePtr->virtualFunction(); // Allowed since obj is constructed in device code
    }
 
+Host and Device Attributes of Default Destructors
+===================================================
+
+If a default destructor does not have explicit host or device attributes,
+clang infers these attributes based on the destructors of its data members
+and base classes. If any conflicts are detected among these destructors,
+clang diagnoses the issue. Otherwise, clang adds an implicit host or device
+attribute according to whether the data members's and base classes's
+destructors can execute on the host or device side.
+
+For explicit template classes with virtual destructors, which must be emitted,
+the inference adopts a conservative approach. In this case, implicit host or
+device attributes from member and base class destructors are ignored. This
+precaution is necessary because, although a constexpr destructor carries
+implicit host or device attributes, a constexpr function may call a
+non-constexpr function, which is by default a host function.
+
+Users can override the inferred host and device attributes of default
+destructors by adding explicit host and device attributes to them.
+
 C++ Standard Parallelism Offload Support: Compiler And Runtime
 ==============================================================
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index a30a7076ea5d4..af648d7f9c63f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -4336,11 +4336,11 @@ class Sema final : public SemaBase {
   // Whether the callee should be ignored in CUDA/HIP/OpenMP host/device check.
   bool shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee);
 
-private:
   /// Function or variable declarations to be checked for whether the deferred
   /// diagnostics should be emitted.
   llvm::SmallSetVector<Decl *, 4> DeclsToCheckForDeferredDiags;
 
+private:
   /// Map of current shadowing declarations to shadowed declarations. Warn if
   /// it looks like the user is trying to modify the shadowing declaration.
   llvm::DenseMap<const NamedDecl *, const NamedDecl *> ShadowingDecls;
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 9507d7602aa40..e0eac690e6e65 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1789,6 +1789,47 @@ class DeferredDiagnosticsEmitter
       Inherited::visitUsedDecl(Loc, D);
   }
 
+  // Visitor member and parent dtors called by this dtor.
+  void VisitCalledDestructors(CXXDestructorDecl *DD) {
+    const CXXRecordDecl *RD = DD->getParent();
+
+    // Visit the dtors of all members
+    for (const FieldDecl *FD : RD->fields()) {
+      QualType FT = FD->getType();
+      if (const auto *RT = FT->getAs<RecordType>())
+        if (const auto *ClassDecl = dyn_cast<CXXRecordDecl>(RT->getDecl()))
+          if (ClassDecl->hasDefinition())
+            if (CXXDestructorDecl *MemberDtor = ClassDecl->getDestructor())
+              asImpl().visitUsedDecl(MemberDtor->getLocation(), MemberDtor);
+    }
+
+    // Also visit base class dtors
+    for (const auto &Base : RD->bases()) {
+      QualType BaseType = Base.getType();
+      if (const auto *RT = BaseType->getAs<RecordType>())
+        if (const auto *BaseDecl = dyn_cast<CXXRecordDecl>(RT->getDecl()))
+          if (BaseDecl->hasDefinition())
+            if (CXXDestructorDecl *BaseDtor = BaseDecl->getDestructor())
+              asImpl().visitUsedDecl(BaseDtor->getLocation(), BaseDtor);
+    }
+  }
+
+  void VisitDeclStmt(DeclStmt *DS) {
+    // Visit dtors called by variables that need destruction
+    for (auto *D : DS->decls())
+      if (auto *VD = dyn_cast<VarDecl>(D))
+        if (VD->isThisDeclarationADefinition() &&
+            VD->needsDestruction(S.Context)) {
+          QualType VT = VD->getType();
+          if (const auto *RT = VT->getAs<RecordType>())
+            if (const auto *ClassDecl = dyn_cast<CXXRecordDecl>(RT->getDecl()))
+              if (ClassDecl->hasDefinition())
+                if (CXXDestructorDecl *Dtor = ClassDecl->getDestructor())
+                  asImpl().visitUsedDecl(Dtor->getLocation(), Dtor);
+        }
+
+    Inherited::VisitDeclStmt(DS);
+  }
   void checkVar(VarDecl *VD) {
     assert(VD->isFileVarDecl() &&
            "Should only check file-scope variables");
@@ -1830,6 +1871,8 @@ class DeferredDiagnosticsEmitter
     if (auto *S = FD->getBody()) {
       this->Visit(S);
     }
+    if (CXXDestructorDecl *Dtor = dyn_cast<CXXDestructorDecl>(FD))
+      asImpl().VisitCalledDestructors(Dtor);
     UsePath.pop_back();
     InUsePath.erase(FD);
   }
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 0e1bf727d72d2..0e5fc5e1a40b4 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -372,6 +372,21 @@ bool SemaCUDA::inferTargetForImplicitSpecialMember(CXXRecordDecl *ClassDecl,
                                                    CXXMethodDecl *MemberDecl,
                                                    bool ConstRHS,
                                                    bool Diagnose) {
+  // If MemberDecl is virtual destructor of an explicit template class
+  // instantiation, it must be emitted, therefore it needs to be inferred
+  // conservatively by ignoring implicit host/device attrs of member and parent
+  // dtors called by it. Also, it needs to be checed by deferred diag visitor.
+  bool IsExpVDtor = false;
+  if (isa<CXXDestructorDecl>(MemberDecl) && MemberDecl->isVirtual()) {
+    if (auto *Spec = dyn_cast<ClassTemplateSpecializationDecl>(ClassDecl)) {
+      TemplateSpecializationKind TSK = Spec->getTemplateSpecializationKind();
+      IsExpVDtor = TSK == TSK_ExplicitInstantiationDeclaration ||
+                   TSK == TSK_ExplicitInstantiationDefinition;
+    }
+  }
+  if (IsExpVDtor)
+    SemaRef.DeclsToCheckForDeferredDiags.insert(MemberDecl);
+
   // If the defaulted special member is defined lexically outside of its
   // owning class, or the special member already has explicit device or host
   // attributes, do not infer.
@@ -422,7 +437,9 @@ bool SemaCUDA::inferTargetForImplicitSpecialMember(CXXRecordDecl *ClassDecl,
     if (!SMOR.getMethod())
       continue;
 
-    CUDAFunctionTarget BaseMethodTarget = IdentifyTarget(SMOR.getMethod());
+    CUDAFunctionTarget BaseMethodTarget =
+        IdentifyTarget(SMOR.getMethod(), IsExpVDtor);
+
     if (!InferredTarget) {
       InferredTarget = BaseMethodTarget;
     } else {
@@ -466,7 +483,9 @@ bool SemaCUDA::inferTargetForImplicitSpecialMember(CXXRecordDecl *ClassDecl,
     if (!SMOR.getMethod())
       continue;
 
-    CUDAFunctionTarget FieldMethodTarget = IdentifyTarget(SMOR.getMethod());
+    CUDAFunctionTarget FieldMethodTarget =
+        IdentifyTarget(SMOR.getMethod(), IsExpVDtor);
+
     if (!InferredTarget) {
       InferredTarget = FieldMethodTarget;
     } else {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 01f09aba8c2ad..f70401ea33b4a 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -20388,6 +20388,21 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD,
 
     if (IsEmittedForExternalSymbol())
       return FunctionEmissionStatus::Emitted;
+
+    // If FD is a virtual destructor of an explicit instantiation
+    // of a template class, return Emitted.
+    if (auto *Destructor = dyn_cast<CXXDestructorDecl>(FD)) {
+      if (Destructor->isVirtual()) {
+        if (auto *Spec = dyn_cast<ClassTemplateSpecializationDecl>(
+                Destructor->getParent())) {
+          TemplateSpecializationKind TSK =
+              Spec->getTemplateSpecializationKind();
+          if (TSK == TSK_ExplicitInstantiationDeclaration ||
+              TSK == TSK_ExplicitInstantiationDefinition)
+            return FunctionEmissionStatus::Emitted;
+        }
+      }
+    }
   }
 
   // Otherwise, the function is known-emitted if it's in our set of
diff --git a/clang/test/SemaCUDA/dtor.cu b/clang/test/SemaCUDA/dtor.cu
new file mode 100644
index 0000000000000..cc37837e70791
--- /dev/null
+++ b/clang/test/SemaCUDA/dtor.cu
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 %s -std=c++20 -fsyntax-only -verify=host
+// RUN: %clang_cc1 %s -std=c++20 -fcuda-is-device -fsyntax-only -verify=dev
+
+// host-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+// Virtual dtor ~B() of explicit instantiation B<float> must
+// be emitted, which causes host_fun() called.
+namespace ExplicitInstantiationExplicitDevDtor {
+void host_fun() // dev-note {{'host_fun' declared here}}
+{}
+
+template <unsigned>
+constexpr void hd_fun() {
+  host_fun(); // dev-error {{reference to __host__ function 'host_fun' in __host__ __device__ function}}
+}
+
+struct A {
+  constexpr ~A() { // dev-note {{called by '~B'}}
+     hd_fun<8>(); // dev-note {{called by '~A'}}
+  }
+};
+
+template <typename T>
+struct B {
+public:
+  virtual __device__ ~B() = default;
+  A _a;
+};
+
+template class B<float>;
+}
+
+// The implicit host/device attrs of virtual dtor ~B() should be
+// conservatively inferred, where constexpr member dtor's should
+// not be considered device since they may call host functions.
+// Therefore B<float>::~B() should not have implicit device attr.
+// However C<float>::~C() should have implicit device attr since
+// it is trivial.
+namespace ExplicitInstantiationDtorNoAttr {
+void host_fun()
+{}
+
+template <unsigned>
+constexpr void hd_fun() {
+  host_fun();
+}
+
+struct A {
+  constexpr ~A() {
+     hd_fun<8>();
+  }
+};
+
+template <typename T>
+struct B {
+public:
+  virtual ~B() = default;
+  A _a;
+};
+
+template <typename T>
+struct C {
+public:
+  virtual ~C() = default;
+};
+
+template class B<float>;
+template class C<float>;
+__device__ void foo() {
+  C<float> x;
+}
+}
+
+// Dtors of implicit template class instantiation are not
+// conservatively inferred because the invalid usage can
+// be diagnosed.
+namespace ImplicitInstantiation {
+void host_fun() // dev-note {{'host_fun' declared here}}
+{}
+
+template <unsigned>
+constexpr void hd_fun() {
+  host_fun(); // dev-error {{reference to __host__ function 'host_fun' in __host__ __device__ function}}
+}
+
+struct A {
+  constexpr ~A() { // dev-note {{called by '~B'}}
+     hd_fun<8>(); // dev-note {{called by '~A'}}
+  }
+};
+
+template <typename T>
+struct B {
+public:
+  ~B() = default; // dev-note {{called by 'foo'}}
+  A _a;
+};
+
+__device__ void foo() {
+  B<float> x;
+}
+}

From e3f0ce3ef803a5689fe0710e50052af019474dc0 Mon Sep 17 00:00:00 2001
From: cor3ntin <corentinjabot@gmail.com>
Date: Sat, 15 Mar 2025 22:27:08 +0100
Subject: [PATCH 252/282] [Clang] Do not emit nodiscard warnings for the base
 expr of static member access (#131450)

For an expression `nodiscard_function().static_member(), the nodiscard
warnings added by #120223, are not useful or actionable, and are
disruptive to some library implementations; we just remove them.

Fixes #131410

(cherry picked from commit 9a1e39062b2ab445f1f4424ecdc5ffb46e8cb9e0)
---
 clang/include/clang/Sema/Sema.h                        |  5 -----
 clang/lib/Sema/SemaExprMember.cpp                      |  1 -
 clang/lib/Sema/SemaStmt.cpp                            |  4 ----
 .../CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp     | 10 ++++++----
 clang/test/SemaCXX/ms-property.cpp                     |  2 +-
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index af648d7f9c63f..48fd4bb342c65 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10671,11 +10671,6 @@ class Sema final : public SemaBase {
                            SourceLocation EndLoc);
   void ActOnForEachDeclStmt(DeclGroupPtrTy Decl);
 
-  /// DiagnoseDiscardedExprMarkedNodiscard - Given an expression that is
-  /// semantically a discarded-value expression, diagnose if any [[nodiscard]]
-  /// value has been discarded.
-  void DiagnoseDiscardedExprMarkedNodiscard(const Expr *E);
-
   /// DiagnoseUnusedExprResult - If the statement passed in is an expression
   /// whose result is unused, warn.
   void DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID);
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index d130e8b86bc56..adb8e3cc90c0c 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -1136,7 +1136,6 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
     if (Converted.isInvalid())
       return true;
     BaseExpr = Converted.get();
-    DiagnoseDiscardedExprMarkedNodiscard(BaseExpr);
     return false;
   };
   auto ConvertBaseExprToGLValue = [&] {
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 947651d514b3b..b8b59793d6508 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -413,10 +413,6 @@ void DiagnoseUnused(Sema &S, const Expr *E, std::optional<unsigned> DiagID) {
 }
 } // namespace
 
-void Sema::DiagnoseDiscardedExprMarkedNodiscard(const Expr *E) {
-  DiagnoseUnused(*this, E, std::nullopt);
-}
-
 void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
   if (const LabelStmt *Label = dyn_cast_if_present<LabelStmt>(S))
     S = Label->getSubStmt();
diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
index 18f4bd5e9c0fa..0012ab976baa5 100644
--- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
@@ -164,19 +164,21 @@ struct X {
 
 [[nodiscard]] X get_X();
 // cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}}
+[[nodiscard]] X* get_Ptr();
+// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}}
 void f() {
+  get_X(); // expected-warning{{ignoring return value of function declared with 'nodiscard' attribute}}
+  (void) get_X();
   (void) get_X().variant_member;
   (void) get_X().anonymous_struct_member;
   (void) get_X().data_member;
   (void) get_X().static_data_member;
-  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
   (void) get_X().unscoped_enum;
-  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
   (void) get_X().scoped_enum;
-  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
   (void) get_X().implicit_object_member_function();
   (void) get_X().static_member_function();
-  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (void) get_Ptr()->implicit_object_member_function();
+  (void) get_Ptr()->static_member_function();
 #if __cplusplus >= 202302L
   (void) get_X().explicit_object_member_function();
 #endif
diff --git a/clang/test/SemaCXX/ms-property.cpp b/clang/test/SemaCXX/ms-property.cpp
index d5799a8a4d363..f1424b9cb12bc 100644
--- a/clang/test/SemaCXX/ms-property.cpp
+++ b/clang/test/SemaCXX/ms-property.cpp
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -emit-pch -o %t -verify %s
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -include-pch %t %s -ast-print -o - | FileCheck %s
 // RUN: %clang_cc1 -fdeclspec -fsyntax-only -verify %s -std=c++23
+// expected-no-diagnostics
 
 #ifndef HEADER
 #define HEADER
@@ -103,7 +104,6 @@ struct X {
 void f() {
   (void) get_x().imp;
   (void) get_x().st;
-  // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}}
 #if __cplusplus >= 202302L
   (void) get_x().exp;
 #endif

From 2044b18af046df923536da10fa1f8fbe4b0cd6f6 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Mon, 17 Mar 2025 21:50:26 +0100
Subject: [PATCH 253/282] fix abi

---
 clang/include/clang/Sema/Sema.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 48fd4bb342c65..b6e45b8940a1a 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10671,6 +10671,10 @@ class Sema final : public SemaBase {
                            SourceLocation EndLoc);
   void ActOnForEachDeclStmt(DeclGroupPtrTy Decl);
 
+
+  // Unused, kept in Clang 20 for ABI stability.
+  void DiagnoseDiscardedExprMarkedNodiscard(const Expr *E) {};
+
   /// DiagnoseUnusedExprResult - If the statement passed in is an expression
   /// whose result is unused, warn.
   void DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID);

From 2dc152fb40dbe0b0a80bb17dbcac9cc39ed3761d Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Tue, 18 Mar 2025 09:53:12 +0100
Subject: [PATCH 254/282] fix abi (again)

---
 clang/include/clang/Sema/Sema.h | 2 +-
 clang/lib/Sema/SemaStmt.cpp     | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b6e45b8940a1a..cecf5cff332f4 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10673,7 +10673,7 @@ class Sema final : public SemaBase {
 
 
   // Unused, kept in Clang 20 for ABI stability.
-  void DiagnoseDiscardedExprMarkedNodiscard(const Expr *E) {};
+  void DiagnoseDiscardedExprMarkedNodiscard(const Expr *E);
 
   /// DiagnoseUnusedExprResult - If the statement passed in is an expression
   /// whose result is unused, warn.
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index b8b59793d6508..947651d514b3b 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -413,6 +413,10 @@ void DiagnoseUnused(Sema &S, const Expr *E, std::optional<unsigned> DiagID) {
 }
 } // namespace
 
+void Sema::DiagnoseDiscardedExprMarkedNodiscard(const Expr *E) {
+  DiagnoseUnused(*this, E, std::nullopt);
+}
+
 void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
   if (const LabelStmt *Label = dyn_cast_if_present<LabelStmt>(S))
     S = Label->getSubStmt();

From 0619bbcbbd864f662790b194c77baaf5578d21b9 Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Tue, 18 Mar 2025 15:41:30 -0700
Subject: [PATCH 255/282] [BPF] Add default cpu change in ReleaseNotes
 (#131691)

The pull request [1] changed bpf default cpu from -mcpu=v1 to
-mcpu=v3 in clang20. Recently in [1], Yuval Deutscher suggested
to add an entry to clang20 ReleaseNotes so users can easily find
the change from documentation.

  [1] https://github.com/llvm/llvm-project/pull/107008

Co-authored-by: Yonghong Song <yonghong.song@linux.dev>
---
 clang/docs/ReleaseNotes.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index dc63b5213c546..03b68271b7864 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1300,6 +1300,11 @@ AVR Support
 
 - Reject C/C++ compilation for avr1 devices which have no SRAM.
 
+BPF Support
+^^^^^^^^^^^
+
+- Make ``-mcpu=v3`` as the default.
+
 DWARF Support in Clang
 ----------------------
 

From 2cc53628fbe88d00c8f54bfd54530330f191ebfc Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Mon, 17 Mar 2025 13:59:16 +0800
Subject: [PATCH 256/282] [SCEV] Check whether the start is non-zero in
 `ScalarEvolution::howFarToZero` (#131522)

https://github.com/llvm/llvm-project/pull/94525 assumes that the loop
will be infinite when the stride is zero. However, it doesn't hold when
the start value of addrec is also zero.

Closes https://github.com/llvm/llvm-project/issues/131465.

(cherry picked from commit c5a491e9ea22014b65664b6e09134b4f055933e2)
---
 llvm/lib/Analysis/ScalarEvolution.cpp         |  9 ++--
 .../trip-count-unknown-stride.ll              | 34 +++++++++++----
 llvm/test/Transforms/LoopUnroll/pr131465.ll   | 43 +++++++++++++++++++
 3 files changed, 74 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/pr131465.ll

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c71202c8dd58e..b8069df4e6598 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10635,10 +10635,11 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
   if (ControlsOnlyExit && AddRec->hasNoSelfWrap() &&
       loopHasNoAbnormalExits(AddRec->getLoop())) {
 
-    // If the stride is zero, the loop must be infinite.  In C++, most loops
-    // are finite by assumption, in which case the step being zero implies
-    // UB must execute if the loop is entered.
-    if (!loopIsFiniteByAssumption(L) && !isKnownNonZero(StepWLG))
+    // If the stride is zero and the start is non-zero, the loop must be
+    // infinite. In C++, most loops are finite by assumption, in which case the
+    // step being zero implies UB must execute if the loop is entered.
+    if (!(loopIsFiniteByAssumption(L) && isKnownNonZero(Start)) &&
+        !isKnownNonZero(StepWLG))
       return getCouldNotCompute();
 
     const SCEV *Exact =
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
index 2d02cb6194f4c..1f08a620b2e15 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
@@ -329,10 +329,9 @@ define void @ne_nsw_nonneg_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress {
 ;
 ; CHECK-LABEL: 'ne_nsw_nonneg_step'
 ; CHECK-NEXT:  Determining loop execution counts for: @ne_nsw_nonneg_step
-; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 -1
-; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
 ;
 entry:
   %nonneg_step = icmp sge i32 %s, 0
@@ -442,10 +441,9 @@ define void @ne_nuw_nonneg_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress {
 ;
 ; CHECK-LABEL: 'ne_nuw_nonneg_step'
 ; CHECK-NEXT:  Determining loop execution counts for: @ne_nuw_nonneg_step
-; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 -1
-; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
 ;
 entry:
   %nonneg_step = icmp sge i32 %s, 0
@@ -493,6 +491,26 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+define i32 @pr131465(i1 %x) mustprogress {
+; CHECK-LABEL: 'pr131465'
+; CHECK-NEXT:  Determining loop execution counts for: @pr131465
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  %inc = zext i1 %x to i32
+  br label %for.body
+
+for.body:
+  %indvar = phi i32 [ 2, %entry ], [ %next, %for.body ]
+  %next = add nsw i32 %indvar, %inc
+  %exitcond = icmp eq i32 %next, 2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
 
 declare void @llvm.assume(i1)
 
diff --git a/llvm/test/Transforms/LoopUnroll/pr131465.ll b/llvm/test/Transforms/LoopUnroll/pr131465.ll
new file mode 100644
index 0000000000000..643b020c6c110
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/pr131465.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-unroll -unroll-runtime %s | FileCheck %s
+
+define i32 @pr131465(i1 %x) mustprogress {
+; CHECK-LABEL: define i32 @pr131465(
+; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[INC:%.*]] = zext i1 [[X]] to i32
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ 2, %[[ENTRY]] ], [ [[NEXT_1:%.*]], %[[FOR_BODY_1:.*]] ]
+; CHECK-NEXT:    [[NEXT:%.*]] = add nsw i32 [[INDVAR]], [[INC]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[NEXT]], 2
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY_1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[FOR_BODY_1]]:
+; CHECK-NEXT:    [[NEXT_1]] = add nsw i32 [[NEXT]], [[INC]]
+; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i32 [[NEXT_1]], 2
+; CHECK-NEXT:    br i1 [[EXITCOND_1]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %inc = zext i1 %x to i32
+  br label %for.body
+
+for.body:
+  %indvar = phi i32 [ 2, %entry ], [ %next, %for.body ]
+  %next = add nsw i32 %indvar, %inc
+  %exitcond = icmp eq i32 %next, 2
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret i32 0
+}
+
+; Force runtime unrolling.
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 2}}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.count", i32 2}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.disable"}
+;.

From 424c2d9b7e4de40d0804dd374721e6411c27d1d1 Mon Sep 17 00:00:00 2001
From: Takuto Ikuta <tikuta@google.com>
Date: Thu, 13 Feb 2025 16:54:43 +0900
Subject: [PATCH 257/282] [libcxx] Add a missing include for __bit_iterator
 (#127015)

This is to fix compile error with explicit Clang modules like
```
../../third_party/libc++/src/include/__vector/vector_bool.h:85:11: error: default argument of '__bit_iterator' must be imported from module 'std.bit_reference_fwd' before it is required
   85 |   typedef __bit_iterator<vector, false> pointer;
      |           ^
../../third_party/libc++/src/include/__fwd/bit_reference.h:23:68: note: default argument declared here is not reachable
   23 | template <class _Cp, bool _IsConst, typename _Cp::__storage_type = 0>
      |                                                                    ^
```

(cherry picked from commit 672e3858a4e4b9e155adb72426074ea2af0dd922)
---
 libcxx/include/__vector/vector_bool.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h
index 4f1c442ce0be8..feff646a35dc8 100644
--- a/libcxx/include/__vector/vector_bool.h
+++ b/libcxx/include/__vector/vector_bool.h
@@ -17,6 +17,7 @@
 #include <__bit_reference>
 #include <__config>
 #include <__functional/unary_function.h>
+#include <__fwd/bit_reference.h>
 #include <__fwd/functional.h>
 #include <__fwd/vector.h>
 #include <__iterator/distance.h>

From 3f957cc67cff4e337f56fa2dbfdb037d3a997baf Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 21 Mar 2025 14:03:40 -0700
Subject: [PATCH 258/282] Bump version to 20.1.2 (#132293)

---
 cmake/Modules/LLVMVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 05e772cb6fa38..49cdc04707eb7 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -7,7 +7,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 1)
+  set(LLVM_VERSION_PATCH 2)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX)

From 9710e9963455563444c5834f0c9a1e77a21447da Mon Sep 17 00:00:00 2001
From: Evgenii Kudriashov <evgenii.kudriashov@intel.com>
Date: Tue, 25 Mar 2025 13:26:13 +0100
Subject: [PATCH 259/282] [X86][AVX10.2] Include changes for COMX and VGETEXP
 from rev. 2 (#132824)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address missing changes:
- V[,U]COMXSD need to have XD (F3.0F –> F2.0F)
- V[,U]COMXS[S,H] need to have XS (F2.[0F,MAP5] -> F3.[0F,MAP5])
- VGETEXPBF16 needs to have T_MAP6 and NP (66.MAP5 -> NP.MAP6)

 Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965

(cherry picked from commit 975c208556ef85b321a223fe592fa6d98fadfaa0)
---
 llvm/lib/Target/X86/X86InstrAVX10.td          | 20 ++--
 .../CodeGen/X86/avx10_2_512bf16-intrinsics.ll |  4 +-
 .../CodeGen/X86/avx10_2bf16-intrinsics.ll     | 20 ++--
 .../MC/Disassembler/X86/avx10.2-bf16-32.txt   | 54 +++++------
 .../MC/Disassembler/X86/avx10.2-bf16-64.txt   | 54 +++++------
 .../MC/Disassembler/X86/avx10.2-com-ef-32.txt | 96 +++++++++----------
 .../MC/Disassembler/X86/avx10.2-com-ef-64.txt | 96 +++++++++----------
 llvm/test/MC/X86/avx10.2-bf16-32-att.s        | 54 +++++------
 llvm/test/MC/X86/avx10.2-bf16-32-intel.s      | 54 +++++------
 llvm/test/MC/X86/avx10.2-bf16-64-att.s        | 54 +++++------
 llvm/test/MC/X86/avx10.2-bf16-64-intel.s      | 54 +++++------
 llvm/test/MC/X86/avx10.2-com-ef-32-att.s      | 96 +++++++++----------
 llvm/test/MC/X86/avx10.2-com-ef-32-intel.s    | 96 +++++++++----------
 llvm/test/MC/X86/avx10.2-com-ef-64-att.s      | 96 +++++++++----------
 llvm/test/MC/X86/avx10.2-com-ef-64-intel.s    | 96 +++++++++----------
 15 files changed, 472 insertions(+), 472 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 9bb3e364f7c62..37d3b0a67cd33 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -1468,7 +1468,7 @@ defm VRSQRT  : avx10_fp14_bf16<0x4E, "vrsqrt", X86rsqrt14, SchedWriteFRsqrt>,
 defm VRCP    : avx10_fp14_bf16<0x4C, "vrcp", X86rcp14, SchedWriteFRcp>,
                                 T_MAP6, PS, EVEX_CD8<16, CD8VF>;
 defm VGETEXP : avx10_fp14_bf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>,
-                                T_MAP5, EVEX_CD8<16, CD8VF>;
+                                T_MAP6, PS, EVEX_CD8<16, CD8VF>;
 
 // VSCALEFBF16
 multiclass avx10_fp_scalef_bf16<bits<8> opc, string OpcodeStr,
@@ -1665,31 +1665,31 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
 let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {
   defm VUCOMXSDZ  :  avx10_com_ef<0x2e, FR64X, f64, X86ucomi512,
                                   "vucomxsd", f64mem, loadf64, SSEPackedDouble>,
-                                  TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
+                                  TB, XD, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
   defm VUCOMXSHZ  :  avx10_com_ef<0x2e, FR16X, f16, X86ucomi512,
                                   "vucomxsh", f16mem, loadf16, SSEPackedSingle>,
-                                  T_MAP5, XD, EVEX_CD8<16, CD8VT1>;
+                                  T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
   defm VUCOMXSSZ  :  avx10_com_ef<0x2e, FR32X, f32, X86ucomi512,
                                   "vucomxss", f32mem, loadf32, SSEPackedSingle>,
-                                  TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+                                  TB, XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
   defm VCOMXSDZ   :  avx10_com_ef_int<0x2f, v2f64x_info, X86comi512,
                                       "vcomxsd", SSEPackedDouble>,
-                                      TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
+                                      TB, XD, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
   defm VCOMXSHZ   :  avx10_com_ef_int<0x2f, v8f16x_info, X86comi512,
                                       "vcomxsh", SSEPackedSingle>,
-                                      T_MAP5, XD, EVEX_CD8<16, CD8VT1>;
+                                      T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
   defm VCOMXSSZ   :  avx10_com_ef_int<0x2f, v4f32x_info, X86comi512,
                                       "vcomxss", SSEPackedSingle>,
-                                      TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+                                      TB, XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
   defm VUCOMXSDZ  :  avx10_com_ef_int<0x2e, v2f64x_info, X86ucomi512,
                                       "vucomxsd", SSEPackedDouble>,
-                                      TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
+                                      TB, XD, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
   defm VUCOMXSHZ  :  avx10_com_ef_int<0x2e, v8f16x_info, X86ucomi512,
                                       "vucomxsh", SSEPackedSingle>,
-                                      T_MAP5, XD, EVEX_CD8<16, CD8VT1>;
+                                      T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
   defm VUCOMXSSZ  :  avx10_com_ef_int<0x2e, v4f32x_info, X86ucomi512,
                                       "vucomxss", SSEPackedSingle>,
-                                      TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+                                      TB, XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
 }
 
 //-------------------------------------------------
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
index da17b995afedf..cbac76e9de273 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
@@ -164,7 +164,7 @@ define <32 x bfloat>@test_int_x86_avx512_mask_getexp_bf16_512(<32 x bfloat> %x0,
 ; X64-LABEL: test_int_x86_avx512_mask_getexp_bf16_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vgetexpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x42,0xc0]
+; X64-NEXT:    vgetexpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x42,0xc0]
 ; X64-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8]
 ; X64-NEXT:    vaddbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
@@ -172,7 +172,7 @@ define <32 x bfloat>@test_int_x86_avx512_mask_getexp_bf16_512(<32 x bfloat> %x0,
 ; X86-LABEL: test_int_x86_avx512_mask_getexp_bf16_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vgetexpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x42,0xc0]
+; X86-NEXT:    vgetexpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x42,0xc0]
 ; X86-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8]
 ; X86-NEXT:    vaddbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll
index 06875dbe7cd23..ba32b2adc7999 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll
@@ -333,7 +333,7 @@ declare <16 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.256(<16 x bfloat>, <16 x
 define <8 x bfloat>@test_int_x86_avx512_getexp_bf16_128(<8 x bfloat> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_getexp_bf16_128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vgetexpbf16 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x42,0xc0]
+; CHECK-NEXT:    vgetexpbf16 %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x42,0xc0]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 -1)
   ret <8 x bfloat> %res
@@ -343,14 +343,14 @@ define <8 x bfloat>@test_int_x86_avx512_mask_getexp_bf16_128(<8 x bfloat> %x0, <
 ; X64-LABEL: test_int_x86_avx512_mask_getexp_bf16_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vgetexpbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8]
+; X64-NEXT:    vgetexpbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x42,0xc8]
 ; X64-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_getexp_bf16_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vgetexpbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8]
+; X86-NEXT:    vgetexpbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x42,0xc8]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2)
@@ -361,13 +361,13 @@ define <8 x bfloat>@test_int_x86_avx512_maskz_getexp_bf16_128(<8 x bfloat> %x0,
 ; X64-LABEL: test_int_x86_avx512_maskz_getexp_bf16_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vgetexpbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0]
+; X64-NEXT:    vgetexpbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x42,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_avx512_maskz_getexp_bf16_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vgetexpbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0]
+; X86-NEXT:    vgetexpbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x42,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 %x2)
   ret <8 x bfloat> %res
@@ -376,7 +376,7 @@ define <8 x bfloat>@test_int_x86_avx512_maskz_getexp_bf16_128(<8 x bfloat> %x0,
 define <16 x bfloat>@test_int_x86_avx512_getexp_bf16_256(<16 x bfloat> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_getexp_bf16_256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vgetexpbf16 %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x42,0xc0]
+; CHECK-NEXT:    vgetexpbf16 %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x42,0xc0]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 -1)
   ret <16 x bfloat> %res
@@ -386,14 +386,14 @@ define <16 x bfloat>@test_int_x86_avx512_mask_getexp_bf16_256(<16 x bfloat> %x0,
 ; X64-LABEL: test_int_x86_avx512_mask_getexp_bf16_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vgetexpbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8]
+; X64-NEXT:    vgetexpbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x42,0xc8]
 ; X64-NEXT:    vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_getexp_bf16_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vgetexpbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8]
+; X86-NEXT:    vgetexpbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x42,0xc8]
 ; X86-NEXT:    vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2)
@@ -404,13 +404,13 @@ define <16 x bfloat>@test_int_x86_avx512_maskz_getexp_bf16_256(<16 x bfloat> %x0
 ; X64-LABEL: test_int_x86_avx512_maskz_getexp_bf16_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vgetexpbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0]
+; X64-NEXT:    vgetexpbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x42,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_avx512_maskz_getexp_bf16_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vgetexpbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0]
+; X86-NEXT:    vgetexpbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x42,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.bf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 %x2)
   ret <16 x bfloat> %res
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt
index a32e55e20e6b7..0db70d290e565 100644
--- a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt
@@ -1719,111 +1719,111 @@
 
 # ATT:   vgetexpbf16 %xmm3, %xmm2
 # INTEL: vgetexpbf16 xmm2, xmm3
-0x62,0xf5,0x7d,0x08,0x42,0xd3
+0x62,0xf6,0x7c,0x08,0x42,0xd3
 
 # ATT:   vgetexpbf16 %xmm3, %xmm2 {%k7}
 # INTEL: vgetexpbf16 xmm2 {k7}, xmm3
-0x62,0xf5,0x7d,0x0f,0x42,0xd3
+0x62,0xf6,0x7c,0x0f,0x42,0xd3
 
 # ATT:   vgetexpbf16 %xmm3, %xmm2 {%k7} {z}
 # INTEL: vgetexpbf16 xmm2 {k7} {z}, xmm3
-0x62,0xf5,0x7d,0x8f,0x42,0xd3
+0x62,0xf6,0x7c,0x8f,0x42,0xd3
 
 # ATT:   vgetexpbf16 %zmm3, %zmm2
 # INTEL: vgetexpbf16 zmm2, zmm3
-0x62,0xf5,0x7d,0x48,0x42,0xd3
+0x62,0xf6,0x7c,0x48,0x42,0xd3
 
 # ATT:   vgetexpbf16 %zmm3, %zmm2 {%k7}
 # INTEL: vgetexpbf16 zmm2 {k7}, zmm3
-0x62,0xf5,0x7d,0x4f,0x42,0xd3
+0x62,0xf6,0x7c,0x4f,0x42,0xd3
 
 # ATT:   vgetexpbf16 %zmm3, %zmm2 {%k7} {z}
 # INTEL: vgetexpbf16 zmm2 {k7} {z}, zmm3
-0x62,0xf5,0x7d,0xcf,0x42,0xd3
+0x62,0xf6,0x7c,0xcf,0x42,0xd3
 
 # ATT:   vgetexpbf16 %ymm3, %ymm2
 # INTEL: vgetexpbf16 ymm2, ymm3
-0x62,0xf5,0x7d,0x28,0x42,0xd3
+0x62,0xf6,0x7c,0x28,0x42,0xd3
 
 # ATT:   vgetexpbf16 %ymm3, %ymm2 {%k7}
 # INTEL: vgetexpbf16 ymm2 {k7}, ymm3
-0x62,0xf5,0x7d,0x2f,0x42,0xd3
+0x62,0xf6,0x7c,0x2f,0x42,0xd3
 
 # ATT:   vgetexpbf16 %ymm3, %ymm2 {%k7} {z}
 # INTEL: vgetexpbf16 ymm2 {k7} {z}, ymm3
-0x62,0xf5,0x7d,0xaf,0x42,0xd3
+0x62,0xf6,0x7c,0xaf,0x42,0xd3
 
 # ATT:   vgetexpbf16  268435456(%esp,%esi,8), %xmm2
 # INTEL: vgetexpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456]
-0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf6,0x7c,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vgetexpbf16  291(%edi,%eax,4), %xmm2 {%k7}
 # INTEL: vgetexpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291]
-0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf6,0x7c,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vgetexpbf16  (%eax){1to8}, %xmm2
 # INTEL: vgetexpbf16 xmm2, word ptr [eax]{1to8}
-0x62,0xf5,0x7d,0x18,0x42,0x10
+0x62,0xf6,0x7c,0x18,0x42,0x10
 
 # ATT:   vgetexpbf16  -512(,%ebp,2), %xmm2
 # INTEL: vgetexpbf16 xmm2, xmmword ptr [2*ebp - 512]
-0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff
+0x62,0xf6,0x7c,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff
 
 # ATT:   vgetexpbf16  2032(%ecx), %xmm2 {%k7} {z}
 # INTEL: vgetexpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032]
-0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f
+0x62,0xf6,0x7c,0x8f,0x42,0x51,0x7f
 
 # ATT:   vgetexpbf16  -256(%edx){1to8}, %xmm2 {%k7} {z}
 # INTEL: vgetexpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}
-0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80
+0x62,0xf6,0x7c,0x9f,0x42,0x52,0x80
 
 # ATT:   vgetexpbf16  268435456(%esp,%esi,8), %ymm2
 # INTEL: vgetexpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456]
-0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf6,0x7c,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vgetexpbf16  291(%edi,%eax,4), %ymm2 {%k7}
 # INTEL: vgetexpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291]
-0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf6,0x7c,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vgetexpbf16  (%eax){1to16}, %ymm2
 # INTEL: vgetexpbf16 ymm2, word ptr [eax]{1to16}
-0x62,0xf5,0x7d,0x38,0x42,0x10
+0x62,0xf6,0x7c,0x38,0x42,0x10
 
 # ATT:   vgetexpbf16  -1024(,%ebp,2), %ymm2
 # INTEL: vgetexpbf16 ymm2, ymmword ptr [2*ebp - 1024]
-0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff
+0x62,0xf6,0x7c,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff
 
 # ATT:   vgetexpbf16  4064(%ecx), %ymm2 {%k7} {z}
 # INTEL: vgetexpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064]
-0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f
+0x62,0xf6,0x7c,0xaf,0x42,0x51,0x7f
 
 # ATT:   vgetexpbf16  -256(%edx){1to16}, %ymm2 {%k7} {z}
 # INTEL: vgetexpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}
-0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80
+0x62,0xf6,0x7c,0xbf,0x42,0x52,0x80
 
 # ATT:   vgetexpbf16  268435456(%esp,%esi,8), %zmm2
 # INTEL: vgetexpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456]
-0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf6,0x7c,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vgetexpbf16  291(%edi,%eax,4), %zmm2 {%k7}
 # INTEL: vgetexpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291]
-0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf6,0x7c,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vgetexpbf16  (%eax){1to32}, %zmm2
 # INTEL: vgetexpbf16 zmm2, word ptr [eax]{1to32}
-0x62,0xf5,0x7d,0x58,0x42,0x10
+0x62,0xf6,0x7c,0x58,0x42,0x10
 
 # ATT:   vgetexpbf16  -2048(,%ebp,2), %zmm2
 # INTEL: vgetexpbf16 zmm2, zmmword ptr [2*ebp - 2048]
-0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff
+0x62,0xf6,0x7c,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff
 
 # ATT:   vgetexpbf16  8128(%ecx), %zmm2 {%k7} {z}
 # INTEL: vgetexpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128]
-0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f
+0x62,0xf6,0x7c,0xcf,0x42,0x51,0x7f
 
 # ATT:   vgetexpbf16  -256(%edx){1to32}, %zmm2 {%k7} {z}
 # INTEL: vgetexpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}
-0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80
+0x62,0xf6,0x7c,0xdf,0x42,0x52,0x80
 
 # ATT:   vgetmantbf16 $123, %zmm3, %zmm2
 # INTEL: vgetmantbf16 zmm2, zmm3, 123
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt
index 1319c5cbd0362..197415e5ba329 100644
--- a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt
@@ -1719,111 +1719,111 @@
 
 # ATT:   vgetexpbf16 %xmm23, %xmm22
 # INTEL: vgetexpbf16 xmm22, xmm23
-0x62,0xa5,0x7d,0x08,0x42,0xf7
+0x62,0xa6,0x7c,0x08,0x42,0xf7
 
 # ATT:   vgetexpbf16 %xmm23, %xmm22 {%k7}
 # INTEL: vgetexpbf16 xmm22 {k7}, xmm23
-0x62,0xa5,0x7d,0x0f,0x42,0xf7
+0x62,0xa6,0x7c,0x0f,0x42,0xf7
 
 # ATT:   vgetexpbf16 %xmm23, %xmm22 {%k7} {z}
 # INTEL: vgetexpbf16 xmm22 {k7} {z}, xmm23
-0x62,0xa5,0x7d,0x8f,0x42,0xf7
+0x62,0xa6,0x7c,0x8f,0x42,0xf7
 
 # ATT:   vgetexpbf16 %zmm23, %zmm22
 # INTEL: vgetexpbf16 zmm22, zmm23
-0x62,0xa5,0x7d,0x48,0x42,0xf7
+0x62,0xa6,0x7c,0x48,0x42,0xf7
 
 # ATT:   vgetexpbf16 %zmm23, %zmm22 {%k7}
 # INTEL: vgetexpbf16 zmm22 {k7}, zmm23
-0x62,0xa5,0x7d,0x4f,0x42,0xf7
+0x62,0xa6,0x7c,0x4f,0x42,0xf7
 
 # ATT:   vgetexpbf16 %zmm23, %zmm22 {%k7} {z}
 # INTEL: vgetexpbf16 zmm22 {k7} {z}, zmm23
-0x62,0xa5,0x7d,0xcf,0x42,0xf7
+0x62,0xa6,0x7c,0xcf,0x42,0xf7
 
 # ATT:   vgetexpbf16 %ymm23, %ymm22
 # INTEL: vgetexpbf16 ymm22, ymm23
-0x62,0xa5,0x7d,0x28,0x42,0xf7
+0x62,0xa6,0x7c,0x28,0x42,0xf7
 
 # ATT:   vgetexpbf16 %ymm23, %ymm22 {%k7}
 # INTEL: vgetexpbf16 ymm22 {k7}, ymm23
-0x62,0xa5,0x7d,0x2f,0x42,0xf7
+0x62,0xa6,0x7c,0x2f,0x42,0xf7
 
 # ATT:   vgetexpbf16 %ymm23, %ymm22 {%k7} {z}
 # INTEL: vgetexpbf16 ymm22 {k7} {z}, ymm23
-0x62,0xa5,0x7d,0xaf,0x42,0xf7
+0x62,0xa6,0x7c,0xaf,0x42,0xf7
 
 # ATT:   vgetexpbf16  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vgetexpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa6,0x7c,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vgetexpbf16  291(%r8,%rax,4), %xmm22 {%k7}
 # INTEL: vgetexpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291]
-0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc6,0x7c,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vgetexpbf16  (%rip){1to8}, %xmm22
 # INTEL: vgetexpbf16 xmm22, word ptr [rip]{1to8}
-0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00
+0x62,0xe6,0x7c,0x18,0x42,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vgetexpbf16  -512(,%rbp,2), %xmm22
 # INTEL: vgetexpbf16 xmm22, xmmword ptr [2*rbp - 512]
-0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff
+0x62,0xe6,0x7c,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff
 
 # ATT:   vgetexpbf16  2032(%rcx), %xmm22 {%k7} {z}
 # INTEL: vgetexpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032]
-0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f
+0x62,0xe6,0x7c,0x8f,0x42,0x71,0x7f
 
 # ATT:   vgetexpbf16  -256(%rdx){1to8}, %xmm22 {%k7} {z}
 # INTEL: vgetexpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}
-0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80
+0x62,0xe6,0x7c,0x9f,0x42,0x72,0x80
 
 # ATT:   vgetexpbf16  268435456(%rbp,%r14,8), %ymm22
 # INTEL: vgetexpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa6,0x7c,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vgetexpbf16  291(%r8,%rax,4), %ymm22 {%k7}
 # INTEL: vgetexpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291]
-0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc6,0x7c,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vgetexpbf16  (%rip){1to16}, %ymm22
 # INTEL: vgetexpbf16 ymm22, word ptr [rip]{1to16}
-0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00
+0x62,0xe6,0x7c,0x38,0x42,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vgetexpbf16  -1024(,%rbp,2), %ymm22
 # INTEL: vgetexpbf16 ymm22, ymmword ptr [2*rbp - 1024]
-0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff
+0x62,0xe6,0x7c,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff
 
 # ATT:   vgetexpbf16  4064(%rcx), %ymm22 {%k7} {z}
 # INTEL: vgetexpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064]
-0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f
+0x62,0xe6,0x7c,0xaf,0x42,0x71,0x7f
 
 # ATT:   vgetexpbf16  -256(%rdx){1to16}, %ymm22 {%k7} {z}
 # INTEL: vgetexpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}
-0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80
+0x62,0xe6,0x7c,0xbf,0x42,0x72,0x80
 
 # ATT:   vgetexpbf16  268435456(%rbp,%r14,8), %zmm22
 # INTEL: vgetexpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa6,0x7c,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vgetexpbf16  291(%r8,%rax,4), %zmm22 {%k7}
 # INTEL: vgetexpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291]
-0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc6,0x7c,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vgetexpbf16  (%rip){1to32}, %zmm22
 # INTEL: vgetexpbf16 zmm22, word ptr [rip]{1to32}
-0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00
+0x62,0xe6,0x7c,0x58,0x42,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vgetexpbf16  -2048(,%rbp,2), %zmm22
 # INTEL: vgetexpbf16 zmm22, zmmword ptr [2*rbp - 2048]
-0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff
+0x62,0xe6,0x7c,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff
 
 # ATT:   vgetexpbf16  8128(%rcx), %zmm22 {%k7} {z}
 # INTEL: vgetexpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128]
-0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f
+0x62,0xe6,0x7c,0xcf,0x42,0x71,0x7f
 
 # ATT:   vgetexpbf16  -256(%rdx){1to32}, %zmm22 {%k7} {z}
 # INTEL: vgetexpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}
-0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80
+0x62,0xe6,0x7c,0xdf,0x42,0x72,0x80
 
 # ATT:   vgetmantbf16 $123, %zmm23, %zmm22
 # INTEL: vgetmantbf16 zmm22, zmm23, 123
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt
index e7adacbbf88c8..ecdc75979e8df 100644
--- a/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt
@@ -3,193 +3,193 @@
 
 # ATT:   vcomxsd %xmm3, %xmm2
 # INTEL: vcomxsd xmm2, xmm3
-0x62,0xf1,0xfe,0x08,0x2f,0xd3
+0x62,0xf1,0xff,0x08,0x2f,0xd3
 
 # ATT:   vcomxsd {sae}, %xmm3, %xmm2
 # INTEL: vcomxsd xmm2, xmm3, {sae}
-0x62,0xf1,0xfe,0x18,0x2f,0xd3
+0x62,0xf1,0xff,0x18,0x2f,0xd3
 
 # ATT:   vcomxsd  268435456(%esp,%esi,8), %xmm2
 # INTEL: vcomxsd xmm2, qword ptr [esp + 8*esi + 268435456]
-0x62,0xf1,0xfe,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf1,0xff,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vcomxsd  291(%edi,%eax,4), %xmm2
 # INTEL: vcomxsd xmm2, qword ptr [edi + 4*eax + 291]
-0x62,0xf1,0xfe,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf1,0xff,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vcomxsd  (%eax), %xmm2
 # INTEL: vcomxsd xmm2, qword ptr [eax]
-0x62,0xf1,0xfe,0x08,0x2f,0x10
+0x62,0xf1,0xff,0x08,0x2f,0x10
 
 # ATT:   vcomxsd  -256(,%ebp,2), %xmm2
 # INTEL: vcomxsd xmm2, qword ptr [2*ebp - 256]
-0x62,0xf1,0xfe,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff
+0x62,0xf1,0xff,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff
 
 # ATT:   vcomxsd  1016(%ecx), %xmm2
 # INTEL: vcomxsd xmm2, qword ptr [ecx + 1016]
-0x62,0xf1,0xfe,0x08,0x2f,0x51,0x7f
+0x62,0xf1,0xff,0x08,0x2f,0x51,0x7f
 
 # ATT:   vcomxsd  -1024(%edx), %xmm2
 # INTEL: vcomxsd xmm2, qword ptr [edx - 1024]
-0x62,0xf1,0xfe,0x08,0x2f,0x52,0x80
+0x62,0xf1,0xff,0x08,0x2f,0x52,0x80
 
 # ATT:   vcomxsh %xmm3, %xmm2
 # INTEL: vcomxsh xmm2, xmm3
-0x62,0xf5,0x7f,0x08,0x2f,0xd3
+0x62,0xf5,0x7e,0x08,0x2f,0xd3
 
 # ATT:   vcomxsh {sae}, %xmm3, %xmm2
 # INTEL: vcomxsh xmm2, xmm3, {sae}
-0x62,0xf5,0x7f,0x18,0x2f,0xd3
+0x62,0xf5,0x7e,0x18,0x2f,0xd3
 
 # ATT:   vcomxsh  268435456(%esp,%esi,8), %xmm2
 # INTEL: vcomxsh xmm2, word ptr [esp + 8*esi + 268435456]
-0x62,0xf5,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf5,0x7e,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vcomxsh  291(%edi,%eax,4), %xmm2
 # INTEL: vcomxsh xmm2, word ptr [edi + 4*eax + 291]
-0x62,0xf5,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf5,0x7e,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vcomxsh  (%eax), %xmm2
 # INTEL: vcomxsh xmm2, word ptr [eax]
-0x62,0xf5,0x7f,0x08,0x2f,0x10
+0x62,0xf5,0x7e,0x08,0x2f,0x10
 
 # ATT:   vcomxsh  -64(,%ebp,2), %xmm2
 # INTEL: vcomxsh xmm2, word ptr [2*ebp - 64]
-0x62,0xf5,0x7f,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff
+0x62,0xf5,0x7e,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff
 
 # ATT:   vcomxsh  254(%ecx), %xmm2
 # INTEL: vcomxsh xmm2, word ptr [ecx + 254]
-0x62,0xf5,0x7f,0x08,0x2f,0x51,0x7f
+0x62,0xf5,0x7e,0x08,0x2f,0x51,0x7f
 
 # ATT:   vcomxsh  -256(%edx), %xmm2
 # INTEL: vcomxsh xmm2, word ptr [edx - 256]
-0x62,0xf5,0x7f,0x08,0x2f,0x52,0x80
+0x62,0xf5,0x7e,0x08,0x2f,0x52,0x80
 
 # ATT:   vcomxss %xmm3, %xmm2
 # INTEL: vcomxss xmm2, xmm3
-0x62,0xf1,0x7f,0x08,0x2f,0xd3
+0x62,0xf1,0x7e,0x08,0x2f,0xd3
 
 # ATT:   vcomxss {sae}, %xmm3, %xmm2
 # INTEL: vcomxss xmm2, xmm3, {sae}
-0x62,0xf1,0x7f,0x18,0x2f,0xd3
+0x62,0xf1,0x7e,0x18,0x2f,0xd3
 
 # ATT:   vcomxss  268435456(%esp,%esi,8), %xmm2
 # INTEL: vcomxss xmm2, dword ptr [esp + 8*esi + 268435456]
-0x62,0xf1,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf1,0x7e,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vcomxss  291(%edi,%eax,4), %xmm2
 # INTEL: vcomxss xmm2, dword ptr [edi + 4*eax + 291]
-0x62,0xf1,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf1,0x7e,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vcomxss  (%eax), %xmm2
 # INTEL: vcomxss xmm2, dword ptr [eax]
-0x62,0xf1,0x7f,0x08,0x2f,0x10
+0x62,0xf1,0x7e,0x08,0x2f,0x10
 
 # ATT:   vcomxss  -128(,%ebp,2), %xmm2
 # INTEL: vcomxss xmm2, dword ptr [2*ebp - 128]
-0x62,0xf1,0x7f,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff
+0x62,0xf1,0x7e,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff
 
 # ATT:   vcomxss  508(%ecx), %xmm2
 # INTEL: vcomxss xmm2, dword ptr [ecx + 508]
-0x62,0xf1,0x7f,0x08,0x2f,0x51,0x7f
+0x62,0xf1,0x7e,0x08,0x2f,0x51,0x7f
 
 # ATT:   vcomxss  -512(%edx), %xmm2
 # INTEL: vcomxss xmm2, dword ptr [edx - 512]
-0x62,0xf1,0x7f,0x08,0x2f,0x52,0x80
+0x62,0xf1,0x7e,0x08,0x2f,0x52,0x80
 
 # ATT:   vucomxsd %xmm3, %xmm2
 # INTEL: vucomxsd xmm2, xmm3
-0x62,0xf1,0xfe,0x08,0x2e,0xd3
+0x62,0xf1,0xff,0x08,0x2e,0xd3
 
 # ATT:   vucomxsd {sae}, %xmm3, %xmm2
 # INTEL: vucomxsd xmm2, xmm3, {sae}
-0x62,0xf1,0xfe,0x18,0x2e,0xd3
+0x62,0xf1,0xff,0x18,0x2e,0xd3
 
 # ATT:   vucomxsd  268435456(%esp,%esi,8), %xmm2
 # INTEL: vucomxsd xmm2, qword ptr [esp + 8*esi + 268435456]
-0x62,0xf1,0xfe,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf1,0xff,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vucomxsd  291(%edi,%eax,4), %xmm2
 # INTEL: vucomxsd xmm2, qword ptr [edi + 4*eax + 291]
-0x62,0xf1,0xfe,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf1,0xff,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vucomxsd  (%eax), %xmm2
 # INTEL: vucomxsd xmm2, qword ptr [eax]
-0x62,0xf1,0xfe,0x08,0x2e,0x10
+0x62,0xf1,0xff,0x08,0x2e,0x10
 
 # ATT:   vucomxsd  -256(,%ebp,2), %xmm2
 # INTEL: vucomxsd xmm2, qword ptr [2*ebp - 256]
-0x62,0xf1,0xfe,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff
+0x62,0xf1,0xff,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff
 
 # ATT:   vucomxsd  1016(%ecx), %xmm2
 # INTEL: vucomxsd xmm2, qword ptr [ecx + 1016]
-0x62,0xf1,0xfe,0x08,0x2e,0x51,0x7f
+0x62,0xf1,0xff,0x08,0x2e,0x51,0x7f
 
 # ATT:   vucomxsd  -1024(%edx), %xmm2
 # INTEL: vucomxsd xmm2, qword ptr [edx - 1024]
-0x62,0xf1,0xfe,0x08,0x2e,0x52,0x80
+0x62,0xf1,0xff,0x08,0x2e,0x52,0x80
 
 # ATT:   vucomxsh %xmm3, %xmm2
 # INTEL: vucomxsh xmm2, xmm3
-0x62,0xf5,0x7f,0x08,0x2e,0xd3
+0x62,0xf5,0x7e,0x08,0x2e,0xd3
 
 # ATT:   vucomxsh {sae}, %xmm3, %xmm2
 # INTEL: vucomxsh xmm2, xmm3, {sae}
-0x62,0xf5,0x7f,0x18,0x2e,0xd3
+0x62,0xf5,0x7e,0x18,0x2e,0xd3
 
 # ATT:   vucomxsh  268435456(%esp,%esi,8), %xmm2
 # INTEL: vucomxsh xmm2, word ptr [esp + 8*esi + 268435456]
-0x62,0xf5,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf5,0x7e,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vucomxsh  291(%edi,%eax,4), %xmm2
 # INTEL: vucomxsh xmm2, word ptr [edi + 4*eax + 291]
-0x62,0xf5,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf5,0x7e,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vucomxsh  (%eax), %xmm2
 # INTEL: vucomxsh xmm2, word ptr [eax]
-0x62,0xf5,0x7f,0x08,0x2e,0x10
+0x62,0xf5,0x7e,0x08,0x2e,0x10
 
 # ATT:   vucomxsh  -64(,%ebp,2), %xmm2
 # INTEL: vucomxsh xmm2, word ptr [2*ebp - 64]
-0x62,0xf5,0x7f,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff
+0x62,0xf5,0x7e,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff
 
 # ATT:   vucomxsh  254(%ecx), %xmm2
 # INTEL: vucomxsh xmm2, word ptr [ecx + 254]
-0x62,0xf5,0x7f,0x08,0x2e,0x51,0x7f
+0x62,0xf5,0x7e,0x08,0x2e,0x51,0x7f
 
 # ATT:   vucomxsh  -256(%edx), %xmm2
 # INTEL: vucomxsh xmm2, word ptr [edx - 256]
-0x62,0xf5,0x7f,0x08,0x2e,0x52,0x80
+0x62,0xf5,0x7e,0x08,0x2e,0x52,0x80
 
 # ATT:   vucomxss %xmm3, %xmm2
 # INTEL: vucomxss xmm2, xmm3
-0x62,0xf1,0x7f,0x08,0x2e,0xd3
+0x62,0xf1,0x7e,0x08,0x2e,0xd3
 
 # ATT:   vucomxss {sae}, %xmm3, %xmm2
 # INTEL: vucomxss xmm2, xmm3, {sae}
-0x62,0xf1,0x7f,0x18,0x2e,0xd3
+0x62,0xf1,0x7e,0x18,0x2e,0xd3
 
 # ATT:   vucomxss  268435456(%esp,%esi,8), %xmm2
 # INTEL: vucomxss xmm2, dword ptr [esp + 8*esi + 268435456]
-0x62,0xf1,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10
+0x62,0xf1,0x7e,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10
 
 # ATT:   vucomxss  291(%edi,%eax,4), %xmm2
 # INTEL: vucomxss xmm2, dword ptr [edi + 4*eax + 291]
-0x62,0xf1,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00
+0x62,0xf1,0x7e,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00
 
 # ATT:   vucomxss  (%eax), %xmm2
 # INTEL: vucomxss xmm2, dword ptr [eax]
-0x62,0xf1,0x7f,0x08,0x2e,0x10
+0x62,0xf1,0x7e,0x08,0x2e,0x10
 
 # ATT:   vucomxss  -128(,%ebp,2), %xmm2
 # INTEL: vucomxss xmm2, dword ptr [2*ebp - 128]
-0x62,0xf1,0x7f,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff
+0x62,0xf1,0x7e,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff
 
 # ATT:   vucomxss  508(%ecx), %xmm2
 # INTEL: vucomxss xmm2, dword ptr [ecx + 508]
-0x62,0xf1,0x7f,0x08,0x2e,0x51,0x7f
+0x62,0xf1,0x7e,0x08,0x2e,0x51,0x7f
 
 # ATT:   vucomxss  -512(%edx), %xmm2
 # INTEL: vucomxss xmm2, dword ptr [edx - 512]
-0x62,0xf1,0x7f,0x08,0x2e,0x52,0x80
+0x62,0xf1,0x7e,0x08,0x2e,0x52,0x80
 
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt
index ea580fe8d5083..e01e762d12aaf 100644
--- a/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt
@@ -3,193 +3,193 @@
 
 # ATT:   vcomxsd %xmm23, %xmm22
 # INTEL: vcomxsd xmm22, xmm23
-0x62,0xa1,0xfe,0x08,0x2f,0xf7
+0x62,0xa1,0xff,0x08,0x2f,0xf7
 
 # ATT:   vcomxsd {sae}, %xmm23, %xmm22
 # INTEL: vcomxsd xmm22, xmm23, {sae}
-0x62,0xa1,0xfe,0x18,0x2f,0xf7
+0x62,0xa1,0xff,0x18,0x2f,0xf7
 
 # ATT:   vcomxsd  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vcomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa1,0xfe,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa1,0xff,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vcomxsd  291(%r8,%rax,4), %xmm22
 # INTEL: vcomxsd xmm22, qword ptr [r8 + 4*rax + 291]
-0x62,0xc1,0xfe,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc1,0xff,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vcomxsd  (%rip), %xmm22
 # INTEL: vcomxsd xmm22, qword ptr [rip]
-0x62,0xe1,0xfe,0x08,0x2f,0x35,0x00,0x00,0x00,0x00
+0x62,0xe1,0xff,0x08,0x2f,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vcomxsd  -256(,%rbp,2), %xmm22
 # INTEL: vcomxsd xmm22, qword ptr [2*rbp - 256]
-0x62,0xe1,0xfe,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff
+0x62,0xe1,0xff,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff
 
 # ATT:   vcomxsd  1016(%rcx), %xmm22
 # INTEL: vcomxsd xmm22, qword ptr [rcx + 1016]
-0x62,0xe1,0xfe,0x08,0x2f,0x71,0x7f
+0x62,0xe1,0xff,0x08,0x2f,0x71,0x7f
 
 # ATT:   vcomxsd  -1024(%rdx), %xmm22
 # INTEL: vcomxsd xmm22, qword ptr [rdx - 1024]
-0x62,0xe1,0xfe,0x08,0x2f,0x72,0x80
+0x62,0xe1,0xff,0x08,0x2f,0x72,0x80
 
 # ATT:   vcomxsh %xmm23, %xmm22
 # INTEL: vcomxsh xmm22, xmm23
-0x62,0xa5,0x7f,0x08,0x2f,0xf7
+0x62,0xa5,0x7e,0x08,0x2f,0xf7
 
 # ATT:   vcomxsh {sae}, %xmm23, %xmm22
 # INTEL: vcomxsh xmm22, xmm23, {sae}
-0x62,0xa5,0x7f,0x18,0x2f,0xf7
+0x62,0xa5,0x7e,0x18,0x2f,0xf7
 
 # ATT:   vcomxsh  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vcomxsh xmm22, word ptr [rbp + 8*r14 + 268435456]
-0x62,0xa5,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa5,0x7e,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vcomxsh  291(%r8,%rax,4), %xmm22
 # INTEL: vcomxsh xmm22, word ptr [r8 + 4*rax + 291]
-0x62,0xc5,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc5,0x7e,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vcomxsh  (%rip), %xmm22
 # INTEL: vcomxsh xmm22, word ptr [rip]
-0x62,0xe5,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00
+0x62,0xe5,0x7e,0x08,0x2f,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vcomxsh  -64(,%rbp,2), %xmm22
 # INTEL: vcomxsh xmm22, word ptr [2*rbp - 64]
-0x62,0xe5,0x7f,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff
+0x62,0xe5,0x7e,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff
 
 # ATT:   vcomxsh  254(%rcx), %xmm22
 # INTEL: vcomxsh xmm22, word ptr [rcx + 254]
-0x62,0xe5,0x7f,0x08,0x2f,0x71,0x7f
+0x62,0xe5,0x7e,0x08,0x2f,0x71,0x7f
 
 # ATT:   vcomxsh  -256(%rdx), %xmm22
 # INTEL: vcomxsh xmm22, word ptr [rdx - 256]
-0x62,0xe5,0x7f,0x08,0x2f,0x72,0x80
+0x62,0xe5,0x7e,0x08,0x2f,0x72,0x80
 
 # ATT:   vcomxss %xmm23, %xmm22
 # INTEL: vcomxss xmm22, xmm23
-0x62,0xa1,0x7f,0x08,0x2f,0xf7
+0x62,0xa1,0x7e,0x08,0x2f,0xf7
 
 # ATT:   vcomxss {sae}, %xmm23, %xmm22
 # INTEL: vcomxss xmm22, xmm23, {sae}
-0x62,0xa1,0x7f,0x18,0x2f,0xf7
+0x62,0xa1,0x7e,0x18,0x2f,0xf7
 
 # ATT:   vcomxss  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vcomxss xmm22, dword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa1,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa1,0x7e,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vcomxss  291(%r8,%rax,4), %xmm22
 # INTEL: vcomxss xmm22, dword ptr [r8 + 4*rax + 291]
-0x62,0xc1,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc1,0x7e,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vcomxss  (%rip), %xmm22
 # INTEL: vcomxss xmm22, dword ptr [rip]
-0x62,0xe1,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00
+0x62,0xe1,0x7e,0x08,0x2f,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vcomxss  -128(,%rbp,2), %xmm22
 # INTEL: vcomxss xmm22, dword ptr [2*rbp - 128]
-0x62,0xe1,0x7f,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff
+0x62,0xe1,0x7e,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff
 
 # ATT:   vcomxss  508(%rcx), %xmm22
 # INTEL: vcomxss xmm22, dword ptr [rcx + 508]
-0x62,0xe1,0x7f,0x08,0x2f,0x71,0x7f
+0x62,0xe1,0x7e,0x08,0x2f,0x71,0x7f
 
 # ATT:   vcomxss  -512(%rdx), %xmm22
 # INTEL: vcomxss xmm22, dword ptr [rdx - 512]
-0x62,0xe1,0x7f,0x08,0x2f,0x72,0x80
+0x62,0xe1,0x7e,0x08,0x2f,0x72,0x80
 
 # ATT:   vucomxsd %xmm23, %xmm22
 # INTEL: vucomxsd xmm22, xmm23
-0x62,0xa1,0xfe,0x08,0x2e,0xf7
+0x62,0xa1,0xff,0x08,0x2e,0xf7
 
 # ATT:   vucomxsd {sae}, %xmm23, %xmm22
 # INTEL: vucomxsd xmm22, xmm23, {sae}
-0x62,0xa1,0xfe,0x18,0x2e,0xf7
+0x62,0xa1,0xff,0x18,0x2e,0xf7
 
 # ATT:   vucomxsd  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vucomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa1,0xfe,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa1,0xff,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vucomxsd  291(%r8,%rax,4), %xmm22
 # INTEL: vucomxsd xmm22, qword ptr [r8 + 4*rax + 291]
-0x62,0xc1,0xfe,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc1,0xff,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vucomxsd  (%rip), %xmm22
 # INTEL: vucomxsd xmm22, qword ptr [rip]
-0x62,0xe1,0xfe,0x08,0x2e,0x35,0x00,0x00,0x00,0x00
+0x62,0xe1,0xff,0x08,0x2e,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vucomxsd  -256(,%rbp,2), %xmm22
 # INTEL: vucomxsd xmm22, qword ptr [2*rbp - 256]
-0x62,0xe1,0xfe,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff
+0x62,0xe1,0xff,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff
 
 # ATT:   vucomxsd  1016(%rcx), %xmm22
 # INTEL: vucomxsd xmm22, qword ptr [rcx + 1016]
-0x62,0xe1,0xfe,0x08,0x2e,0x71,0x7f
+0x62,0xe1,0xff,0x08,0x2e,0x71,0x7f
 
 # ATT:   vucomxsd  -1024(%rdx), %xmm22
 # INTEL: vucomxsd xmm22, qword ptr [rdx - 1024]
-0x62,0xe1,0xfe,0x08,0x2e,0x72,0x80
+0x62,0xe1,0xff,0x08,0x2e,0x72,0x80
 
 # ATT:   vucomxsh %xmm23, %xmm22
 # INTEL: vucomxsh xmm22, xmm23
-0x62,0xa5,0x7f,0x08,0x2e,0xf7
+0x62,0xa5,0x7e,0x08,0x2e,0xf7
 
 # ATT:   vucomxsh {sae}, %xmm23, %xmm22
 # INTEL: vucomxsh xmm22, xmm23, {sae}
-0x62,0xa5,0x7f,0x18,0x2e,0xf7
+0x62,0xa5,0x7e,0x18,0x2e,0xf7
 
 # ATT:   vucomxsh  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vucomxsh xmm22, word ptr [rbp + 8*r14 + 268435456]
-0x62,0xa5,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa5,0x7e,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vucomxsh  291(%r8,%rax,4), %xmm22
 # INTEL: vucomxsh xmm22, word ptr [r8 + 4*rax + 291]
-0x62,0xc5,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc5,0x7e,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vucomxsh  (%rip), %xmm22
 # INTEL: vucomxsh xmm22, word ptr [rip]
-0x62,0xe5,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00
+0x62,0xe5,0x7e,0x08,0x2e,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vucomxsh  -64(,%rbp,2), %xmm22
 # INTEL: vucomxsh xmm22, word ptr [2*rbp - 64]
-0x62,0xe5,0x7f,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff
+0x62,0xe5,0x7e,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff
 
 # ATT:   vucomxsh  254(%rcx), %xmm22
 # INTEL: vucomxsh xmm22, word ptr [rcx + 254]
-0x62,0xe5,0x7f,0x08,0x2e,0x71,0x7f
+0x62,0xe5,0x7e,0x08,0x2e,0x71,0x7f
 
 # ATT:   vucomxsh  -256(%rdx), %xmm22
 # INTEL: vucomxsh xmm22, word ptr [rdx - 256]
-0x62,0xe5,0x7f,0x08,0x2e,0x72,0x80
+0x62,0xe5,0x7e,0x08,0x2e,0x72,0x80
 
 # ATT:   vucomxss %xmm23, %xmm22
 # INTEL: vucomxss xmm22, xmm23
-0x62,0xa1,0x7f,0x08,0x2e,0xf7
+0x62,0xa1,0x7e,0x08,0x2e,0xf7
 
 # ATT:   vucomxss {sae}, %xmm23, %xmm22
 # INTEL: vucomxss xmm22, xmm23, {sae}
-0x62,0xa1,0x7f,0x18,0x2e,0xf7
+0x62,0xa1,0x7e,0x18,0x2e,0xf7
 
 # ATT:   vucomxss  268435456(%rbp,%r14,8), %xmm22
 # INTEL: vucomxss xmm22, dword ptr [rbp + 8*r14 + 268435456]
-0x62,0xa1,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10
+0x62,0xa1,0x7e,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10
 
 # ATT:   vucomxss  291(%r8,%rax,4), %xmm22
 # INTEL: vucomxss xmm22, dword ptr [r8 + 4*rax + 291]
-0x62,0xc1,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00
+0x62,0xc1,0x7e,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00
 
 # ATT:   vucomxss  (%rip), %xmm22
 # INTEL: vucomxss xmm22, dword ptr [rip]
-0x62,0xe1,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00
+0x62,0xe1,0x7e,0x08,0x2e,0x35,0x00,0x00,0x00,0x00
 
 # ATT:   vucomxss  -128(,%rbp,2), %xmm22
 # INTEL: vucomxss xmm22, dword ptr [2*rbp - 128]
-0x62,0xe1,0x7f,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff
+0x62,0xe1,0x7e,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff
 
 # ATT:   vucomxss  508(%rcx), %xmm22
 # INTEL: vucomxss xmm22, dword ptr [rcx + 508]
-0x62,0xe1,0x7f,0x08,0x2e,0x71,0x7f
+0x62,0xe1,0x7e,0x08,0x2e,0x71,0x7f
 
 # ATT:   vucomxss  -512(%rdx), %xmm22
 # INTEL: vucomxss xmm22, dword ptr [rdx - 512]
-0x62,0xe1,0x7f,0x08,0x2e,0x72,0x80
+0x62,0xe1,0x7e,0x08,0x2e,0x72,0x80
 
diff --git a/llvm/test/MC/X86/avx10.2-bf16-32-att.s b/llvm/test/MC/X86/avx10.2-bf16-32-att.s
index e1e82623d838c..88433d7a3411a 100644
--- a/llvm/test/MC/X86/avx10.2-bf16-32-att.s
+++ b/llvm/test/MC/X86/avx10.2-bf16-32-att.s
@@ -1717,111 +1717,111 @@
           vfpclassbf16  $123, -256(%edx){1to32}, %k5 {%k7}
 
 // CHECK: vgetexpbf16 %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x42,0xd3]
           vgetexpbf16 %xmm3, %xmm2
 
 // CHECK: vgetexpbf16 %xmm3, %xmm2 {%k7}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x42,0xd3]
           vgetexpbf16 %xmm3, %xmm2 {%k7}
 
 // CHECK: vgetexpbf16 %xmm3, %xmm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x42,0xd3]
           vgetexpbf16 %xmm3, %xmm2 {%k7} {z}
 
 // CHECK: vgetexpbf16 %zmm3, %zmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x42,0xd3]
           vgetexpbf16 %zmm3, %zmm2
 
 // CHECK: vgetexpbf16 %zmm3, %zmm2 {%k7}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x42,0xd3]
           vgetexpbf16 %zmm3, %zmm2 {%k7}
 
 // CHECK: vgetexpbf16 %zmm3, %zmm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x42,0xd3]
           vgetexpbf16 %zmm3, %zmm2 {%k7} {z}
 
 // CHECK: vgetexpbf16 %ymm3, %ymm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x42,0xd3]
           vgetexpbf16 %ymm3, %ymm2
 
 // CHECK: vgetexpbf16 %ymm3, %ymm2 {%k7}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x42,0xd3]
           vgetexpbf16 %ymm3, %ymm2 {%k7}
 
 // CHECK: vgetexpbf16 %ymm3, %ymm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x42,0xd3]
           vgetexpbf16 %ymm3, %ymm2 {%k7} {z}
 
 // CHECK: vgetexpbf16  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
           vgetexpbf16  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vgetexpbf16  291(%edi,%eax,4), %xmm2 {%k7}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
           vgetexpbf16  291(%edi,%eax,4), %xmm2 {%k7}
 
 // CHECK: vgetexpbf16  (%eax){1to8}, %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x42,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x42,0x10]
           vgetexpbf16  (%eax){1to8}, %xmm2
 
 // CHECK: vgetexpbf16  -512(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff]
           vgetexpbf16  -512(,%ebp,2), %xmm2
 
 // CHECK: vgetexpbf16  2032(%ecx), %xmm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x42,0x51,0x7f]
           vgetexpbf16  2032(%ecx), %xmm2 {%k7} {z}
 
 // CHECK: vgetexpbf16  -256(%edx){1to8}, %xmm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x42,0x52,0x80]
           vgetexpbf16  -256(%edx){1to8}, %xmm2 {%k7} {z}
 
 // CHECK: vgetexpbf16  268435456(%esp,%esi,8), %ymm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
           vgetexpbf16  268435456(%esp,%esi,8), %ymm2
 
 // CHECK: vgetexpbf16  291(%edi,%eax,4), %ymm2 {%k7}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
           vgetexpbf16  291(%edi,%eax,4), %ymm2 {%k7}
 
 // CHECK: vgetexpbf16  (%eax){1to16}, %ymm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x42,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x42,0x10]
           vgetexpbf16  (%eax){1to16}, %ymm2
 
 // CHECK: vgetexpbf16  -1024(,%ebp,2), %ymm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff]
           vgetexpbf16  -1024(,%ebp,2), %ymm2
 
 // CHECK: vgetexpbf16  4064(%ecx), %ymm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x42,0x51,0x7f]
           vgetexpbf16  4064(%ecx), %ymm2 {%k7} {z}
 
 // CHECK: vgetexpbf16  -256(%edx){1to16}, %ymm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x42,0x52,0x80]
           vgetexpbf16  -256(%edx){1to16}, %ymm2 {%k7} {z}
 
 // CHECK: vgetexpbf16  268435456(%esp,%esi,8), %zmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
           vgetexpbf16  268435456(%esp,%esi,8), %zmm2
 
 // CHECK: vgetexpbf16  291(%edi,%eax,4), %zmm2 {%k7}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
           vgetexpbf16  291(%edi,%eax,4), %zmm2 {%k7}
 
 // CHECK: vgetexpbf16  (%eax){1to32}, %zmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x42,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x42,0x10]
           vgetexpbf16  (%eax){1to32}, %zmm2
 
 // CHECK: vgetexpbf16  -2048(,%ebp,2), %zmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff]
           vgetexpbf16  -2048(,%ebp,2), %zmm2
 
 // CHECK: vgetexpbf16  8128(%ecx), %zmm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x42,0x51,0x7f]
           vgetexpbf16  8128(%ecx), %zmm2 {%k7} {z}
 
 // CHECK: vgetexpbf16  -256(%edx){1to32}, %zmm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x42,0x52,0x80]
           vgetexpbf16  -256(%edx){1to32}, %zmm2 {%k7} {z}
 
 // CHECK: vgetmantbf16 $123, %zmm3, %zmm2
diff --git a/llvm/test/MC/X86/avx10.2-bf16-32-intel.s b/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
index d2e9440ba9c34..7e1d0c305336a 100644
--- a/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
+++ b/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
@@ -1717,111 +1717,111 @@
           vfpclassbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123
 
 // CHECK: vgetexpbf16 xmm2, xmm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x42,0xd3]
           vgetexpbf16 xmm2, xmm3
 
 // CHECK: vgetexpbf16 xmm2 {k7}, xmm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x42,0xd3]
           vgetexpbf16 xmm2 {k7}, xmm3
 
 // CHECK: vgetexpbf16 xmm2 {k7} {z}, xmm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x42,0xd3]
           vgetexpbf16 xmm2 {k7} {z}, xmm3
 
 // CHECK: vgetexpbf16 zmm2, zmm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x42,0xd3]
           vgetexpbf16 zmm2, zmm3
 
 // CHECK: vgetexpbf16 zmm2 {k7}, zmm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x42,0xd3]
           vgetexpbf16 zmm2 {k7}, zmm3
 
 // CHECK: vgetexpbf16 zmm2 {k7} {z}, zmm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x42,0xd3]
           vgetexpbf16 zmm2 {k7} {z}, zmm3
 
 // CHECK: vgetexpbf16 ymm2, ymm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x42,0xd3]
           vgetexpbf16 ymm2, ymm3
 
 // CHECK: vgetexpbf16 ymm2 {k7}, ymm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x42,0xd3]
           vgetexpbf16 ymm2 {k7}, ymm3
 
 // CHECK: vgetexpbf16 ymm2 {k7} {z}, ymm3
-// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0xd3]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x42,0xd3]
           vgetexpbf16 ymm2 {k7} {z}, ymm3
 
 // CHECK: vgetexpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
           vgetexpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vgetexpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
           vgetexpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291]
 
 // CHECK: vgetexpbf16 xmm2, word ptr [eax]{1to8}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x42,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x42,0x10]
           vgetexpbf16 xmm2, word ptr [eax]{1to8}
 
 // CHECK: vgetexpbf16 xmm2, xmmword ptr [2*ebp - 512]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff]
           vgetexpbf16 xmm2, xmmword ptr [2*ebp - 512]
 
 // CHECK: vgetexpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x42,0x51,0x7f]
           vgetexpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032]
 
 // CHECK: vgetexpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x42,0x52,0x80]
           vgetexpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}
 
 // CHECK: vgetexpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
           vgetexpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vgetexpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
           vgetexpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291]
 
 // CHECK: vgetexpbf16 ymm2, word ptr [eax]{1to16}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x42,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x42,0x10]
           vgetexpbf16 ymm2, word ptr [eax]{1to16}
 
 // CHECK: vgetexpbf16 ymm2, ymmword ptr [2*ebp - 1024]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff]
           vgetexpbf16 ymm2, ymmword ptr [2*ebp - 1024]
 
 // CHECK: vgetexpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064]
-// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x42,0x51,0x7f]
           vgetexpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064]
 
 // CHECK: vgetexpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x42,0x52,0x80]
           vgetexpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}
 
 // CHECK: vgetexpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10]
           vgetexpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vgetexpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00]
           vgetexpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291]
 
 // CHECK: vgetexpbf16 zmm2, word ptr [eax]{1to32}
-// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x42,0x10]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x42,0x10]
           vgetexpbf16 zmm2, word ptr [eax]{1to32}
 
 // CHECK: vgetexpbf16 zmm2, zmmword ptr [2*ebp - 2048]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff]
+// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff]
           vgetexpbf16 zmm2, zmmword ptr [2*ebp - 2048]
 
 // CHECK: vgetexpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128]
-// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x42,0x51,0x7f]
           vgetexpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128]
 
 // CHECK: vgetexpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}
-// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80]
+// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x42,0x52,0x80]
           vgetexpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}
 
 // CHECK: vgetmantbf16 zmm2, zmm3, 123
diff --git a/llvm/test/MC/X86/avx10.2-bf16-64-att.s b/llvm/test/MC/X86/avx10.2-bf16-64-att.s
index 67d6f3a531dfe..0eb10fbf6d86f 100644
--- a/llvm/test/MC/X86/avx10.2-bf16-64-att.s
+++ b/llvm/test/MC/X86/avx10.2-bf16-64-att.s
@@ -1717,111 +1717,111 @@
           vfpclassbf16  $123, -256(%rdx){1to32}, %k5 {%k7}
 
 // CHECK: vgetexpbf16 %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x42,0xf7]
           vgetexpbf16 %xmm23, %xmm22
 
 // CHECK: vgetexpbf16 %xmm23, %xmm22 {%k7}
-// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x42,0xf7]
           vgetexpbf16 %xmm23, %xmm22 {%k7}
 
 // CHECK: vgetexpbf16 %xmm23, %xmm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x42,0xf7]
           vgetexpbf16 %xmm23, %xmm22 {%k7} {z}
 
 // CHECK: vgetexpbf16 %zmm23, %zmm22
-// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x42,0xf7]
           vgetexpbf16 %zmm23, %zmm22
 
 // CHECK: vgetexpbf16 %zmm23, %zmm22 {%k7}
-// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x42,0xf7]
           vgetexpbf16 %zmm23, %zmm22 {%k7}
 
 // CHECK: vgetexpbf16 %zmm23, %zmm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x42,0xf7]
           vgetexpbf16 %zmm23, %zmm22 {%k7} {z}
 
 // CHECK: vgetexpbf16 %ymm23, %ymm22
-// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x42,0xf7]
           vgetexpbf16 %ymm23, %ymm22
 
 // CHECK: vgetexpbf16 %ymm23, %ymm22 {%k7}
-// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x42,0xf7]
           vgetexpbf16 %ymm23, %ymm22 {%k7}
 
 // CHECK: vgetexpbf16 %ymm23, %ymm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0xaf,0x42,0xf7]
           vgetexpbf16 %ymm23, %ymm22 {%k7} {z}
 
 // CHECK: vgetexpbf16  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vgetexpbf16  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vgetexpbf16  291(%r8,%rax,4), %xmm22 {%k7}
-// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
           vgetexpbf16  291(%r8,%rax,4), %xmm22 {%k7}
 
 // CHECK: vgetexpbf16  (%rip){1to8}, %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x42,0x35,0x00,0x00,0x00,0x00]
           vgetexpbf16  (%rip){1to8}, %xmm22
 
 // CHECK: vgetexpbf16  -512(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff]
           vgetexpbf16  -512(,%rbp,2), %xmm22
 
 // CHECK: vgetexpbf16  2032(%rcx), %xmm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x42,0x71,0x7f]
           vgetexpbf16  2032(%rcx), %xmm22 {%k7} {z}
 
 // CHECK: vgetexpbf16  -256(%rdx){1to8}, %xmm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x42,0x72,0x80]
           vgetexpbf16  -256(%rdx){1to8}, %xmm22 {%k7} {z}
 
 // CHECK: vgetexpbf16  268435456(%rbp,%r14,8), %ymm22
-// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vgetexpbf16  268435456(%rbp,%r14,8), %ymm22
 
 // CHECK: vgetexpbf16  291(%r8,%rax,4), %ymm22 {%k7}
-// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
           vgetexpbf16  291(%r8,%rax,4), %ymm22 {%k7}
 
 // CHECK: vgetexpbf16  (%rip){1to16}, %ymm22
-// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x42,0x35,0x00,0x00,0x00,0x00]
           vgetexpbf16  (%rip){1to16}, %ymm22
 
 // CHECK: vgetexpbf16  -1024(,%rbp,2), %ymm22
-// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff]
           vgetexpbf16  -1024(,%rbp,2), %ymm22
 
 // CHECK: vgetexpbf16  4064(%rcx), %ymm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x42,0x71,0x7f]
           vgetexpbf16  4064(%rcx), %ymm22 {%k7} {z}
 
 // CHECK: vgetexpbf16  -256(%rdx){1to16}, %ymm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x42,0x72,0x80]
           vgetexpbf16  -256(%rdx){1to16}, %ymm22 {%k7} {z}
 
 // CHECK: vgetexpbf16  268435456(%rbp,%r14,8), %zmm22
-// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vgetexpbf16  268435456(%rbp,%r14,8), %zmm22
 
 // CHECK: vgetexpbf16  291(%r8,%rax,4), %zmm22 {%k7}
-// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
           vgetexpbf16  291(%r8,%rax,4), %zmm22 {%k7}
 
 // CHECK: vgetexpbf16  (%rip){1to32}, %zmm22
-// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x42,0x35,0x00,0x00,0x00,0x00]
           vgetexpbf16  (%rip){1to32}, %zmm22
 
 // CHECK: vgetexpbf16  -2048(,%rbp,2), %zmm22
-// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff]
           vgetexpbf16  -2048(,%rbp,2), %zmm22
 
 // CHECK: vgetexpbf16  8128(%rcx), %zmm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x42,0x71,0x7f]
           vgetexpbf16  8128(%rcx), %zmm22 {%k7} {z}
 
 // CHECK: vgetexpbf16  -256(%rdx){1to32}, %zmm22 {%k7} {z}
-// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x42,0x72,0x80]
           vgetexpbf16  -256(%rdx){1to32}, %zmm22 {%k7} {z}
 
 // CHECK: vgetmantbf16 $123, %zmm23, %zmm22
diff --git a/llvm/test/MC/X86/avx10.2-bf16-64-intel.s b/llvm/test/MC/X86/avx10.2-bf16-64-intel.s
index d1727c586e240..b0787a60c7144 100644
--- a/llvm/test/MC/X86/avx10.2-bf16-64-intel.s
+++ b/llvm/test/MC/X86/avx10.2-bf16-64-intel.s
@@ -1717,111 +1717,111 @@
           vfpclassbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123
 
 // CHECK: vgetexpbf16 xmm22, xmm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x42,0xf7]
           vgetexpbf16 xmm22, xmm23
 
 // CHECK: vgetexpbf16 xmm22 {k7}, xmm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x42,0xf7]
           vgetexpbf16 xmm22 {k7}, xmm23
 
 // CHECK: vgetexpbf16 xmm22 {k7} {z}, xmm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x42,0xf7]
           vgetexpbf16 xmm22 {k7} {z}, xmm23
 
 // CHECK: vgetexpbf16 zmm22, zmm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x42,0xf7]
           vgetexpbf16 zmm22, zmm23
 
 // CHECK: vgetexpbf16 zmm22 {k7}, zmm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x42,0xf7]
           vgetexpbf16 zmm22 {k7}, zmm23
 
 // CHECK: vgetexpbf16 zmm22 {k7} {z}, zmm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x42,0xf7]
           vgetexpbf16 zmm22 {k7} {z}, zmm23
 
 // CHECK: vgetexpbf16 ymm22, ymm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x42,0xf7]
           vgetexpbf16 ymm22, ymm23
 
 // CHECK: vgetexpbf16 ymm22 {k7}, ymm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x42,0xf7]
           vgetexpbf16 ymm22 {k7}, ymm23
 
 // CHECK: vgetexpbf16 ymm22 {k7} {z}, ymm23
-// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x42,0xf7]
+// CHECK: encoding: [0x62,0xa6,0x7c,0xaf,0x42,0xf7]
           vgetexpbf16 ymm22 {k7} {z}, ymm23
 
 // CHECK: vgetexpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vgetexpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vgetexpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
           vgetexpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291]
 
 // CHECK: vgetexpbf16 xmm22, word ptr [rip]{1to8}
-// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x42,0x35,0x00,0x00,0x00,0x00]
           vgetexpbf16 xmm22, word ptr [rip]{1to8}
 
 // CHECK: vgetexpbf16 xmm22, xmmword ptr [2*rbp - 512]
-// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff]
           vgetexpbf16 xmm22, xmmword ptr [2*rbp - 512]
 
 // CHECK: vgetexpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032]
-// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x42,0x71,0x7f]
           vgetexpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032]
 
 // CHECK: vgetexpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}
-// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x42,0x72,0x80]
           vgetexpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}
 
 // CHECK: vgetexpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vgetexpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vgetexpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
           vgetexpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291]
 
 // CHECK: vgetexpbf16 ymm22, word ptr [rip]{1to16}
-// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x42,0x35,0x00,0x00,0x00,0x00]
           vgetexpbf16 ymm22, word ptr [rip]{1to16}
 
 // CHECK: vgetexpbf16 ymm22, ymmword ptr [2*rbp - 1024]
-// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff]
           vgetexpbf16 ymm22, ymmword ptr [2*rbp - 1024]
 
 // CHECK: vgetexpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064]
-// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x42,0x71,0x7f]
           vgetexpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064]
 
 // CHECK: vgetexpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}
-// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x42,0x72,0x80]
           vgetexpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}
 
 // CHECK: vgetexpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vgetexpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vgetexpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00]
           vgetexpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291]
 
 // CHECK: vgetexpbf16 zmm22, word ptr [rip]{1to32}
-// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x42,0x35,0x00,0x00,0x00,0x00]
           vgetexpbf16 zmm22, word ptr [rip]{1to32}
 
 // CHECK: vgetexpbf16 zmm22, zmmword ptr [2*rbp - 2048]
-// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff]
+// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff]
           vgetexpbf16 zmm22, zmmword ptr [2*rbp - 2048]
 
 // CHECK: vgetexpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128]
-// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x42,0x71,0x7f]
           vgetexpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128]
 
 // CHECK: vgetexpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}
-// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80]
+// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x42,0x72,0x80]
           vgetexpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}
 
 // CHECK: vgetmantbf16 zmm22, zmm23, 123
diff --git a/llvm/test/MC/X86/avx10.2-com-ef-32-att.s b/llvm/test/MC/X86/avx10.2-com-ef-32-att.s
index 8883bb3d6775a..5f91ec8370ef1 100644
--- a/llvm/test/MC/X86/avx10.2-com-ef-32-att.s
+++ b/llvm/test/MC/X86/avx10.2-com-ef-32-att.s
@@ -1,194 +1,194 @@
 // RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s
 
 // CHECK: vcomxsd %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0xd3]
           vcomxsd %xmm3, %xmm2
 
 // CHECK: vcomxsd {sae}, %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x18,0x2f,0xd3]
           vcomxsd {sae}, %xmm3, %xmm2
 
 // CHECK: vcomxsd  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
           vcomxsd  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vcomxsd  291(%edi,%eax,4), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
           vcomxsd  291(%edi,%eax,4), %xmm2
 
 // CHECK: vcomxsd  (%eax), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x10]
           vcomxsd  (%eax), %xmm2
 
 // CHECK: vcomxsd  -256(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff]
           vcomxsd  -256(,%ebp,2), %xmm2
 
 // CHECK: vcomxsd  1016(%ecx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x51,0x7f]
           vcomxsd  1016(%ecx), %xmm2
 
 // CHECK: vcomxsd  -1024(%edx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x52,0x80]
           vcomxsd  -1024(%edx), %xmm2
 
 // CHECK: vcomxsh %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0xd3]
           vcomxsh %xmm3, %xmm2
 
 // CHECK: vcomxsh {sae}, %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x2f,0xd3]
           vcomxsh {sae}, %xmm3, %xmm2
 
 // CHECK: vcomxsh  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
           vcomxsh  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vcomxsh  291(%edi,%eax,4), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
           vcomxsh  291(%edi,%eax,4), %xmm2
 
 // CHECK: vcomxsh  (%eax), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x10]
           vcomxsh  (%eax), %xmm2
 
 // CHECK: vcomxsh  -64(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff]
           vcomxsh  -64(,%ebp,2), %xmm2
 
 // CHECK: vcomxsh  254(%ecx), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x51,0x7f]
           vcomxsh  254(%ecx), %xmm2
 
 // CHECK: vcomxsh  -256(%edx), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x52,0x80]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x52,0x80]
           vcomxsh  -256(%edx), %xmm2
 
 // CHECK: vcomxss %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0xd3]
           vcomxss %xmm3, %xmm2
 
 // CHECK: vcomxss {sae}, %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x18,0x2f,0xd3]
           vcomxss {sae}, %xmm3, %xmm2
 
 // CHECK: vcomxss  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
           vcomxss  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vcomxss  291(%edi,%eax,4), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
           vcomxss  291(%edi,%eax,4), %xmm2
 
 // CHECK: vcomxss  (%eax), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x10]
           vcomxss  (%eax), %xmm2
 
 // CHECK: vcomxss  -128(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff]
           vcomxss  -128(,%ebp,2), %xmm2
 
 // CHECK: vcomxss  508(%ecx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x51,0x7f]
           vcomxss  508(%ecx), %xmm2
 
 // CHECK: vcomxss  -512(%edx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x52,0x80]
           vcomxss  -512(%edx), %xmm2
 
 // CHECK: vucomxsd %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0xd3]
           vucomxsd %xmm3, %xmm2
 
 // CHECK: vucomxsd {sae}, %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x18,0x2e,0xd3]
           vucomxsd {sae}, %xmm3, %xmm2
 
 // CHECK: vucomxsd  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
           vucomxsd  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vucomxsd  291(%edi,%eax,4), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
           vucomxsd  291(%edi,%eax,4), %xmm2
 
 // CHECK: vucomxsd  (%eax), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x10]
           vucomxsd  (%eax), %xmm2
 
 // CHECK: vucomxsd  -256(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff]
           vucomxsd  -256(,%ebp,2), %xmm2
 
 // CHECK: vucomxsd  1016(%ecx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x51,0x7f]
           vucomxsd  1016(%ecx), %xmm2
 
 // CHECK: vucomxsd  -1024(%edx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x52,0x80]
           vucomxsd  -1024(%edx), %xmm2
 
 // CHECK: vucomxsh %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0xd3]
           vucomxsh %xmm3, %xmm2
 
 // CHECK: vucomxsh {sae}, %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x2e,0xd3]
           vucomxsh {sae}, %xmm3, %xmm2
 
 // CHECK: vucomxsh  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
           vucomxsh  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vucomxsh  291(%edi,%eax,4), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
           vucomxsh  291(%edi,%eax,4), %xmm2
 
 // CHECK: vucomxsh  (%eax), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x10]
           vucomxsh  (%eax), %xmm2
 
 // CHECK: vucomxsh  -64(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff]
           vucomxsh  -64(,%ebp,2), %xmm2
 
 // CHECK: vucomxsh  254(%ecx), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x51,0x7f]
           vucomxsh  254(%ecx), %xmm2
 
 // CHECK: vucomxsh  -256(%edx), %xmm2
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x52,0x80]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x52,0x80]
           vucomxsh  -256(%edx), %xmm2
 
 // CHECK: vucomxss %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0xd3]
           vucomxss %xmm3, %xmm2
 
 // CHECK: vucomxss {sae}, %xmm3, %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x18,0x2e,0xd3]
           vucomxss {sae}, %xmm3, %xmm2
 
 // CHECK: vucomxss  268435456(%esp,%esi,8), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
           vucomxss  268435456(%esp,%esi,8), %xmm2
 
 // CHECK: vucomxss  291(%edi,%eax,4), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
           vucomxss  291(%edi,%eax,4), %xmm2
 
 // CHECK: vucomxss  (%eax), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x10]
           vucomxss  (%eax), %xmm2
 
 // CHECK: vucomxss  -128(,%ebp,2), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff]
           vucomxss  -128(,%ebp,2), %xmm2
 
 // CHECK: vucomxss  508(%ecx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x51,0x7f]
           vucomxss  508(%ecx), %xmm2
 
 // CHECK: vucomxss  -512(%edx), %xmm2
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x52,0x80]
           vucomxss  -512(%edx), %xmm2
 
diff --git a/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s b/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s
index 9ff0484db133c..7cbd4e9722ddb 100644
--- a/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s
+++ b/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s
@@ -1,194 +1,194 @@
 // RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
 
 // CHECK: vcomxsd xmm2, xmm3
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0xd3]
           vcomxsd xmm2, xmm3
 
 // CHECK: vcomxsd xmm2, xmm3, {sae}
-// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x18,0x2f,0xd3]
           vcomxsd xmm2, xmm3, {sae}
 
 // CHECK: vcomxsd xmm2, qword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
           vcomxsd xmm2, qword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vcomxsd xmm2, qword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
           vcomxsd xmm2, qword ptr [edi + 4*eax + 291]
 
 // CHECK: vcomxsd xmm2, qword ptr [eax]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x10]
           vcomxsd xmm2, qword ptr [eax]
 
 // CHECK: vcomxsd xmm2, qword ptr [2*ebp - 256]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff]
           vcomxsd xmm2, qword ptr [2*ebp - 256]
 
 // CHECK: vcomxsd xmm2, qword ptr [ecx + 1016]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x51,0x7f]
           vcomxsd xmm2, qword ptr [ecx + 1016]
 
 // CHECK: vcomxsd xmm2, qword ptr [edx - 1024]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2f,0x52,0x80]
           vcomxsd xmm2, qword ptr [edx - 1024]
 
 // CHECK: vcomxsh xmm2, xmm3
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0xd3]
           vcomxsh xmm2, xmm3
 
 // CHECK: vcomxsh xmm2, xmm3, {sae}
-// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x2f,0xd3]
           vcomxsh xmm2, xmm3, {sae}
 
 // CHECK: vcomxsh xmm2, word ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
           vcomxsh xmm2, word ptr [esp + 8*esi + 268435456]
 
 // CHECK: vcomxsh xmm2, word ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
           vcomxsh xmm2, word ptr [edi + 4*eax + 291]
 
 // CHECK: vcomxsh xmm2, word ptr [eax]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x10]
           vcomxsh xmm2, word ptr [eax]
 
 // CHECK: vcomxsh xmm2, word ptr [2*ebp - 64]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff]
           vcomxsh xmm2, word ptr [2*ebp - 64]
 
 // CHECK: vcomxsh xmm2, word ptr [ecx + 254]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x51,0x7f]
           vcomxsh xmm2, word ptr [ecx + 254]
 
 // CHECK: vcomxsh xmm2, word ptr [edx - 256]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x52,0x80]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2f,0x52,0x80]
           vcomxsh xmm2, word ptr [edx - 256]
 
 // CHECK: vcomxss xmm2, xmm3
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0xd3]
           vcomxss xmm2, xmm3
 
 // CHECK: vcomxss xmm2, xmm3, {sae}
-// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2f,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x18,0x2f,0xd3]
           vcomxss xmm2, xmm3, {sae}
 
 // CHECK: vcomxss xmm2, dword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10]
           vcomxss xmm2, dword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vcomxss xmm2, dword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00]
           vcomxss xmm2, dword ptr [edi + 4*eax + 291]
 
 // CHECK: vcomxss xmm2, dword ptr [eax]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x10]
           vcomxss xmm2, dword ptr [eax]
 
 // CHECK: vcomxss xmm2, dword ptr [2*ebp - 128]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff]
           vcomxss xmm2, dword ptr [2*ebp - 128]
 
 // CHECK: vcomxss xmm2, dword ptr [ecx + 508]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x51,0x7f]
           vcomxss xmm2, dword ptr [ecx + 508]
 
 // CHECK: vcomxss xmm2, dword ptr [edx - 512]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2f,0x52,0x80]
           vcomxss xmm2, dword ptr [edx - 512]
 
 // CHECK: vucomxsd xmm2, xmm3
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0xd3]
           vucomxsd xmm2, xmm3
 
 // CHECK: vucomxsd xmm2, xmm3, {sae}
-// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0xff,0x18,0x2e,0xd3]
           vucomxsd xmm2, xmm3, {sae}
 
 // CHECK: vucomxsd xmm2, qword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
           vucomxsd xmm2, qword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vucomxsd xmm2, qword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
           vucomxsd xmm2, qword ptr [edi + 4*eax + 291]
 
 // CHECK: vucomxsd xmm2, qword ptr [eax]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x10]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x10]
           vucomxsd xmm2, qword ptr [eax]
 
 // CHECK: vucomxsd xmm2, qword ptr [2*ebp - 256]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff]
           vucomxsd xmm2, qword ptr [2*ebp - 256]
 
 // CHECK: vucomxsd xmm2, qword ptr [ecx + 1016]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x51,0x7f]
           vucomxsd xmm2, qword ptr [ecx + 1016]
 
 // CHECK: vucomxsd xmm2, qword ptr [edx - 1024]
-// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0xff,0x08,0x2e,0x52,0x80]
           vucomxsd xmm2, qword ptr [edx - 1024]
 
 // CHECK: vucomxsh xmm2, xmm3
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0xd3]
           vucomxsh xmm2, xmm3
 
 // CHECK: vucomxsh xmm2, xmm3, {sae}
-// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x2e,0xd3]
           vucomxsh xmm2, xmm3, {sae}
 
 // CHECK: vucomxsh xmm2, word ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
           vucomxsh xmm2, word ptr [esp + 8*esi + 268435456]
 
 // CHECK: vucomxsh xmm2, word ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
           vucomxsh xmm2, word ptr [edi + 4*eax + 291]
 
 // CHECK: vucomxsh xmm2, word ptr [eax]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x10]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x10]
           vucomxsh xmm2, word ptr [eax]
 
 // CHECK: vucomxsh xmm2, word ptr [2*ebp - 64]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff]
           vucomxsh xmm2, word ptr [2*ebp - 64]
 
 // CHECK: vucomxsh xmm2, word ptr [ecx + 254]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x51,0x7f]
           vucomxsh xmm2, word ptr [ecx + 254]
 
 // CHECK: vucomxsh xmm2, word ptr [edx - 256]
-// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x52,0x80]
+// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2e,0x52,0x80]
           vucomxsh xmm2, word ptr [edx - 256]
 
 // CHECK: vucomxss xmm2, xmm3
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0xd3]
           vucomxss xmm2, xmm3
 
 // CHECK: vucomxss xmm2, xmm3, {sae}
-// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2e,0xd3]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x18,0x2e,0xd3]
           vucomxss xmm2, xmm3, {sae}
 
 // CHECK: vucomxss xmm2, dword ptr [esp + 8*esi + 268435456]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10]
           vucomxss xmm2, dword ptr [esp + 8*esi + 268435456]
 
 // CHECK: vucomxss xmm2, dword ptr [edi + 4*eax + 291]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00]
           vucomxss xmm2, dword ptr [edi + 4*eax + 291]
 
 // CHECK: vucomxss xmm2, dword ptr [eax]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x10]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x10]
           vucomxss xmm2, dword ptr [eax]
 
 // CHECK: vucomxss xmm2, dword ptr [2*ebp - 128]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff]
           vucomxss xmm2, dword ptr [2*ebp - 128]
 
 // CHECK: vucomxss xmm2, dword ptr [ecx + 508]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x51,0x7f]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x51,0x7f]
           vucomxss xmm2, dword ptr [ecx + 508]
 
 // CHECK: vucomxss xmm2, dword ptr [edx - 512]
-// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x52,0x80]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x08,0x2e,0x52,0x80]
           vucomxss xmm2, dword ptr [edx - 512]
 
diff --git a/llvm/test/MC/X86/avx10.2-com-ef-64-att.s b/llvm/test/MC/X86/avx10.2-com-ef-64-att.s
index 2f3690537334a..832151ab23707 100644
--- a/llvm/test/MC/X86/avx10.2-com-ef-64-att.s
+++ b/llvm/test/MC/X86/avx10.2-com-ef-64-att.s
@@ -1,194 +1,194 @@
 // RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
 
 // CHECK: vcomxsd %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2f,0xf7]
           vcomxsd %xmm23, %xmm22
 
 // CHECK: vcomxsd {sae}, %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x18,0x2f,0xf7]
           vcomxsd {sae}, %xmm23, %xmm22
 
 // CHECK: vcomxsd  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vcomxsd  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vcomxsd  291(%r8,%rax,4), %xmm22
-// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0xff,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
           vcomxsd  291(%r8,%rax,4), %xmm22
 
 // CHECK: vcomxsd  (%rip), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
           vcomxsd  (%rip), %xmm22
 
 // CHECK: vcomxsd  -256(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff]
           vcomxsd  -256(,%rbp,2), %xmm22
 
 // CHECK: vcomxsd  1016(%rcx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x71,0x7f]
           vcomxsd  1016(%rcx), %xmm22
 
 // CHECK: vcomxsd  -1024(%rdx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x72,0x80]
           vcomxsd  -1024(%rdx), %xmm22
 
 // CHECK: vcomxsh %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2f,0xf7]
           vcomxsh %xmm23, %xmm22
 
 // CHECK: vcomxsh {sae}, %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x18,0x2f,0xf7]
           vcomxsh {sae}, %xmm23, %xmm22
 
 // CHECK: vcomxsh  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vcomxsh  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vcomxsh  291(%r8,%rax,4), %xmm22
-// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc5,0x7e,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
           vcomxsh  291(%r8,%rax,4), %xmm22
 
 // CHECK: vcomxsh  (%rip), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
           vcomxsh  (%rip), %xmm22
 
 // CHECK: vcomxsh  -64(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff]
           vcomxsh  -64(,%rbp,2), %xmm22
 
 // CHECK: vcomxsh  254(%rcx), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x71,0x7f]
           vcomxsh  254(%rcx), %xmm22
 
 // CHECK: vcomxsh  -256(%rdx), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x72,0x80]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x72,0x80]
           vcomxsh  -256(%rdx), %xmm22
 
 // CHECK: vcomxss %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2f,0xf7]
           vcomxss %xmm23, %xmm22
 
 // CHECK: vcomxss {sae}, %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x18,0x2f,0xf7]
           vcomxss {sae}, %xmm23, %xmm22
 
 // CHECK: vcomxss  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vcomxss  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vcomxss  291(%r8,%rax,4), %xmm22
-// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0x7e,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
           vcomxss  291(%r8,%rax,4), %xmm22
 
 // CHECK: vcomxss  (%rip), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
           vcomxss  (%rip), %xmm22
 
 // CHECK: vcomxss  -128(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff]
           vcomxss  -128(,%rbp,2), %xmm22
 
 // CHECK: vcomxss  508(%rcx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x71,0x7f]
           vcomxss  508(%rcx), %xmm22
 
 // CHECK: vcomxss  -512(%rdx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x72,0x80]
           vcomxss  -512(%rdx), %xmm22
 
 // CHECK: vucomxsd %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2e,0xf7]
           vucomxsd %xmm23, %xmm22
 
 // CHECK: vucomxsd {sae}, %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x18,0x2e,0xf7]
           vucomxsd {sae}, %xmm23, %xmm22
 
 // CHECK: vucomxsd  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vucomxsd  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vucomxsd  291(%r8,%rax,4), %xmm22
-// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0xff,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
           vucomxsd  291(%r8,%rax,4), %xmm22
 
 // CHECK: vucomxsd  (%rip), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
           vucomxsd  (%rip), %xmm22
 
 // CHECK: vucomxsd  -256(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff]
           vucomxsd  -256(,%rbp,2), %xmm22
 
 // CHECK: vucomxsd  1016(%rcx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x71,0x7f]
           vucomxsd  1016(%rcx), %xmm22
 
 // CHECK: vucomxsd  -1024(%rdx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x72,0x80]
           vucomxsd  -1024(%rdx), %xmm22
 
 // CHECK: vucomxsh %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2e,0xf7]
           vucomxsh %xmm23, %xmm22
 
 // CHECK: vucomxsh {sae}, %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x18,0x2e,0xf7]
           vucomxsh {sae}, %xmm23, %xmm22
 
 // CHECK: vucomxsh  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vucomxsh  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vucomxsh  291(%r8,%rax,4), %xmm22
-// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc5,0x7e,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
           vucomxsh  291(%r8,%rax,4), %xmm22
 
 // CHECK: vucomxsh  (%rip), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
           vucomxsh  (%rip), %xmm22
 
 // CHECK: vucomxsh  -64(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff]
           vucomxsh  -64(,%rbp,2), %xmm22
 
 // CHECK: vucomxsh  254(%rcx), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x71,0x7f]
           vucomxsh  254(%rcx), %xmm22
 
 // CHECK: vucomxsh  -256(%rdx), %xmm22
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x72,0x80]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x72,0x80]
           vucomxsh  -256(%rdx), %xmm22
 
 // CHECK: vucomxss %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2e,0xf7]
           vucomxss %xmm23, %xmm22
 
 // CHECK: vucomxss {sae}, %xmm23, %xmm22
-// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x18,0x2e,0xf7]
           vucomxss {sae}, %xmm23, %xmm22
 
 // CHECK: vucomxss  268435456(%rbp,%r14,8), %xmm22
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vucomxss  268435456(%rbp,%r14,8), %xmm22
 
 // CHECK: vucomxss  291(%r8,%rax,4), %xmm22
-// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0x7e,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
           vucomxss  291(%r8,%rax,4), %xmm22
 
 // CHECK: vucomxss  (%rip), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
           vucomxss  (%rip), %xmm22
 
 // CHECK: vucomxss  -128(,%rbp,2), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff]
           vucomxss  -128(,%rbp,2), %xmm22
 
 // CHECK: vucomxss  508(%rcx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x71,0x7f]
           vucomxss  508(%rcx), %xmm22
 
 // CHECK: vucomxss  -512(%rdx), %xmm22
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x72,0x80]
           vucomxss  -512(%rdx), %xmm22
 
diff --git a/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s b/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s
index 41aaf99270b88..94e3b77984c83 100644
--- a/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s
+++ b/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s
@@ -1,194 +1,194 @@
 // RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
 
 // CHECK: vcomxsd xmm22, xmm23
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2f,0xf7]
           vcomxsd xmm22, xmm23
 
 // CHECK: vcomxsd xmm22, xmm23, {sae}
-// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x18,0x2f,0xf7]
           vcomxsd xmm22, xmm23, {sae}
 
 // CHECK: vcomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vcomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vcomxsd xmm22, qword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0xff,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
           vcomxsd xmm22, qword ptr [r8 + 4*rax + 291]
 
 // CHECK: vcomxsd xmm22, qword ptr [rip]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
           vcomxsd xmm22, qword ptr [rip]
 
 // CHECK: vcomxsd xmm22, qword ptr [2*rbp - 256]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff]
           vcomxsd xmm22, qword ptr [2*rbp - 256]
 
 // CHECK: vcomxsd xmm22, qword ptr [rcx + 1016]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x71,0x7f]
           vcomxsd xmm22, qword ptr [rcx + 1016]
 
 // CHECK: vcomxsd xmm22, qword ptr [rdx - 1024]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2f,0x72,0x80]
           vcomxsd xmm22, qword ptr [rdx - 1024]
 
 // CHECK: vcomxsh xmm22, xmm23
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2f,0xf7]
           vcomxsh xmm22, xmm23
 
 // CHECK: vcomxsh xmm22, xmm23, {sae}
-// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x18,0x2f,0xf7]
           vcomxsh xmm22, xmm23, {sae}
 
 // CHECK: vcomxsh xmm22, word ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vcomxsh xmm22, word ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vcomxsh xmm22, word ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc5,0x7e,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
           vcomxsh xmm22, word ptr [r8 + 4*rax + 291]
 
 // CHECK: vcomxsh xmm22, word ptr [rip]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
           vcomxsh xmm22, word ptr [rip]
 
 // CHECK: vcomxsh xmm22, word ptr [2*rbp - 64]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff]
           vcomxsh xmm22, word ptr [2*rbp - 64]
 
 // CHECK: vcomxsh xmm22, word ptr [rcx + 254]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x71,0x7f]
           vcomxsh xmm22, word ptr [rcx + 254]
 
 // CHECK: vcomxsh xmm22, word ptr [rdx - 256]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x72,0x80]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2f,0x72,0x80]
           vcomxsh xmm22, word ptr [rdx - 256]
 
 // CHECK: vcomxss xmm22, xmm23
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2f,0xf7]
           vcomxss xmm22, xmm23
 
 // CHECK: vcomxss xmm22, xmm23, {sae}
-// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2f,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x18,0x2f,0xf7]
           vcomxss xmm22, xmm23, {sae}
 
 // CHECK: vcomxss xmm22, dword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vcomxss xmm22, dword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vcomxss xmm22, dword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0x7e,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00]
           vcomxss xmm22, dword ptr [r8 + 4*rax + 291]
 
 // CHECK: vcomxss xmm22, dword ptr [rip]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x35,0x00,0x00,0x00,0x00]
           vcomxss xmm22, dword ptr [rip]
 
 // CHECK: vcomxss xmm22, dword ptr [2*rbp - 128]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff]
           vcomxss xmm22, dword ptr [2*rbp - 128]
 
 // CHECK: vcomxss xmm22, dword ptr [rcx + 508]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x71,0x7f]
           vcomxss xmm22, dword ptr [rcx + 508]
 
 // CHECK: vcomxss xmm22, dword ptr [rdx - 512]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2f,0x72,0x80]
           vcomxss xmm22, dword ptr [rdx - 512]
 
 // CHECK: vucomxsd xmm22, xmm23
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2e,0xf7]
           vucomxsd xmm22, xmm23
 
 // CHECK: vucomxsd xmm22, xmm23, {sae}
-// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0xff,0x18,0x2e,0xf7]
           vucomxsd xmm22, xmm23, {sae}
 
 // CHECK: vucomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vucomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vucomxsd xmm22, qword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0xff,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
           vucomxsd xmm22, qword ptr [r8 + 4*rax + 291]
 
 // CHECK: vucomxsd xmm22, qword ptr [rip]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
           vucomxsd xmm22, qword ptr [rip]
 
 // CHECK: vucomxsd xmm22, qword ptr [2*rbp - 256]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff]
           vucomxsd xmm22, qword ptr [2*rbp - 256]
 
 // CHECK: vucomxsd xmm22, qword ptr [rcx + 1016]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x71,0x7f]
           vucomxsd xmm22, qword ptr [rcx + 1016]
 
 // CHECK: vucomxsd xmm22, qword ptr [rdx - 1024]
-// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x2e,0x72,0x80]
           vucomxsd xmm22, qword ptr [rdx - 1024]
 
 // CHECK: vucomxsh xmm22, xmm23
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2e,0xf7]
           vucomxsh xmm22, xmm23
 
 // CHECK: vucomxsh xmm22, xmm23, {sae}
-// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x18,0x2e,0xf7]
           vucomxsh xmm22, xmm23, {sae}
 
 // CHECK: vucomxsh xmm22, word ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vucomxsh xmm22, word ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vucomxsh xmm22, word ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc5,0x7e,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
           vucomxsh xmm22, word ptr [r8 + 4*rax + 291]
 
 // CHECK: vucomxsh xmm22, word ptr [rip]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
           vucomxsh xmm22, word ptr [rip]
 
 // CHECK: vucomxsh xmm22, word ptr [2*rbp - 64]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff]
           vucomxsh xmm22, word ptr [2*rbp - 64]
 
 // CHECK: vucomxsh xmm22, word ptr [rcx + 254]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x71,0x7f]
           vucomxsh xmm22, word ptr [rcx + 254]
 
 // CHECK: vucomxsh xmm22, word ptr [rdx - 256]
-// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x72,0x80]
+// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x2e,0x72,0x80]
           vucomxsh xmm22, word ptr [rdx - 256]
 
 // CHECK: vucomxss xmm22, xmm23
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2e,0xf7]
           vucomxss xmm22, xmm23
 
 // CHECK: vucomxss xmm22, xmm23, {sae}
-// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2e,0xf7]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x18,0x2e,0xf7]
           vucomxss xmm22, xmm23, {sae}
 
 // CHECK: vucomxss xmm22, dword ptr [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10]
           vucomxss xmm22, dword ptr [rbp + 8*r14 + 268435456]
 
 // CHECK: vucomxss xmm22, dword ptr [r8 + 4*rax + 291]
-// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
+// CHECK: encoding: [0x62,0xc1,0x7e,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00]
           vucomxss xmm22, dword ptr [r8 + 4*rax + 291]
 
 // CHECK: vucomxss xmm22, dword ptr [rip]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x35,0x00,0x00,0x00,0x00]
           vucomxss xmm22, dword ptr [rip]
 
 // CHECK: vucomxss xmm22, dword ptr [2*rbp - 128]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff]
           vucomxss xmm22, dword ptr [2*rbp - 128]
 
 // CHECK: vucomxss xmm22, dword ptr [rcx + 508]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x71,0x7f]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x71,0x7f]
           vucomxss xmm22, dword ptr [rcx + 508]
 
 // CHECK: vucomxss xmm22, dword ptr [rdx - 512]
-// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x72,0x80]
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x2e,0x72,0x80]
           vucomxss xmm22, dword ptr [rdx - 512]
 

From 66825a89b8e0e9e1d202cb4d3824791b81afdc98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 21 Mar 2025 15:33:25 +0200
Subject: [PATCH 260/282] [LLD] [COFF] Add a few more mingw libs to skip
 autoexports for (#132289)

"libmsvcrt-os" was added to the list of excluded libs in binutils in
9d9c67b06c1bf4c4550e3de0eb575c2bfbe96df9 in 2017.

"libucrt" was added in c4a8df19ba0a82aa8dea88d9f72ed9e63cb1fa84 in 2022.

"libucrtapp" isn't in the binutils exclusion list yet, but a patch for
adding it has been submitted. Since
0d403d5dd13ce22c07418058f3b640708992890c in mingw-w64 (in 2020), there's
such a third variant of the UCRT import library available.

Since 18df3e8323dcf9fdfec56b5f12c04a9c723a0931 in 2025, "libpthread" and
"libwinpthread" are also excluded.

(cherry picked from commit af93db9344919085551fac38d6d6a4f774a7220a)
---
 lld/COFF/MinGW.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index 76f5a0a7500b9..097cf228f7d6e 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -54,7 +54,12 @@ AutoExporter::AutoExporter(
       "libFortranDecimal",
       "libunwind",
       "libmsvcrt",
+      "libmsvcrt-os",
       "libucrtbase",
+      "libucrt",
+      "libucrtapp",
+      "libpthread",
+      "libwinpthread",
   };
 
   excludeObjects = {

From fcd0ad23f668bce4b3a3731c5baa115434dc3269 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 4 Mar 2025 21:46:55 +0000
Subject: [PATCH 261/282] [AArch64] Add test for scalar copysign. NFC

(cherry picked from commit 4c2d1b4c53def85e16d3612b92379a347d76baf0)
---
 ...e-streaming-mode-fixed-length-fcopysign.ll | 228 ++++++++++++++++++
 1 file changed, 228 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 2282e74af5d00..238c124b7cb06 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -8,6 +8,234 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 target triple = "aarch64-unknown-linux-gnu"
 
+define void @test_copysign_f16(ptr %ap, ptr %bp) {
+; SVE-LABEL: test_copysign_f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    adrp x8, .LCPI0_0
+; SVE-NEXT:    ldr h1, [x0]
+; SVE-NEXT:    ldr h2, [x1]
+; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; SVE-NEXT:    adrp x8, .LCPI0_1
+; SVE-NEXT:    ldr q4, [x8, :lo12:.LCPI0_1]
+; SVE-NEXT:    mov z3.d, z0.d
+; SVE-NEXT:    fmov s0, s1
+; SVE-NEXT:    fmov s3, s2
+; SVE-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE-NEXT:    str h0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: test_copysign_f16:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    adrp x8, .LCPI0_0
+; SVE2-NEXT:    ldr h1, [x0]
+; SVE2-NEXT:    ldr h2, [x1]
+; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; SVE2-NEXT:    adrp x8, .LCPI0_1
+; SVE2-NEXT:    ldr q4, [x8, :lo12:.LCPI0_1]
+; SVE2-NEXT:    mov z3.d, z0.d
+; SVE2-NEXT:    fmov s0, s1
+; SVE2-NEXT:    fmov s3, s2
+; SVE2-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE2-NEXT:    str h0, [x0]
+; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ldr h1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    str h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+  %a = load half, ptr %ap
+  %b = load half, ptr %bp
+  %r = call half @llvm.copysign.f16(half %a, half %b)
+  store half %r, ptr %ap
+  ret void
+}
+
+define void @test_copysign_bf16(ptr %ap, ptr %bp) {
+; SVE-LABEL: test_copysign_bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    adrp x8, .LCPI1_0
+; SVE-NEXT:    ldr h1, [x0]
+; SVE-NEXT:    ldr h2, [x1]
+; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; SVE-NEXT:    adrp x8, .LCPI1_1
+; SVE-NEXT:    ldr q4, [x8, :lo12:.LCPI1_1]
+; SVE-NEXT:    mov z3.d, z0.d
+; SVE-NEXT:    fmov s0, s1
+; SVE-NEXT:    fmov s3, s2
+; SVE-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE-NEXT:    str h0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: test_copysign_bf16:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    adrp x8, .LCPI1_0
+; SVE2-NEXT:    ldr h1, [x0]
+; SVE2-NEXT:    ldr h2, [x1]
+; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; SVE2-NEXT:    adrp x8, .LCPI1_1
+; SVE2-NEXT:    ldr q4, [x8, :lo12:.LCPI1_1]
+; SVE2-NEXT:    mov z3.d, z0.d
+; SVE2-NEXT:    fmov s0, s1
+; SVE2-NEXT:    fmov s3, s2
+; SVE2-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE2-NEXT:    str h0, [x0]
+; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_bf16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ldr h1, [x1]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #76]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
+  %a = load bfloat, ptr %ap
+  %b = load bfloat, ptr %bp
+  %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b)
+  store bfloat %r, ptr %ap
+  ret void
+}
+
+define void @test_copysign_f32(ptr %ap, ptr %bp) {
+; SVE-LABEL: test_copysign_f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    adrp x8, .LCPI2_0
+; SVE-NEXT:    ldr s1, [x0]
+; SVE-NEXT:    ldr s2, [x1]
+; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; SVE-NEXT:    adrp x8, .LCPI2_1
+; SVE-NEXT:    ldr q4, [x8, :lo12:.LCPI2_1]
+; SVE-NEXT:    mov z3.d, z0.d
+; SVE-NEXT:    fmov s0, s1
+; SVE-NEXT:    fmov s3, s2
+; SVE-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE-NEXT:    str s0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: test_copysign_f32:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    adrp x8, .LCPI2_0
+; SVE2-NEXT:    ldr s1, [x0]
+; SVE2-NEXT:    ldr s2, [x1]
+; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; SVE2-NEXT:    adrp x8, .LCPI2_1
+; SVE2-NEXT:    ldr q4, [x8, :lo12:.LCPI2_1]
+; SVE2-NEXT:    mov z3.d, z0.d
+; SVE2-NEXT:    fmov s0, s1
+; SVE2-NEXT:    fmov s3, s2
+; SVE2-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE2-NEXT:    str s0, [x0]
+; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ldr w8, [x1]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ret
+  %a = load float, ptr %ap
+  %b = load float, ptr %bp
+  %r = call float @llvm.copysign.f32(float %a, float %b)
+  store float %r, ptr %ap
+  ret void
+}
+
+define void @test_copysign_f64(ptr %ap, ptr %bp) {
+; SVE-LABEL: test_copysign_f64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    adrp x8, .LCPI3_1
+; SVE-NEXT:    ptrue p0.d, vl2
+; SVE-NEXT:    ldr d2, [x0]
+; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_1]
+; SVE-NEXT:    adrp x8, .LCPI3_0
+; SVE-NEXT:    ldr d3, [x1]
+; SVE-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; SVE-NEXT:    fneg z0.d, p0/m, z0.d
+; SVE-NEXT:    mov z4.d, z1.d
+; SVE-NEXT:    fmov d1, d2
+; SVE-NEXT:    fmov d4, d3
+; SVE-NEXT:    bsl v0.16b, v1.16b, v4.16b
+; SVE-NEXT:    str d0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: test_copysign_f64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    adrp x8, .LCPI3_1
+; SVE2-NEXT:    ptrue p0.d, vl2
+; SVE2-NEXT:    ldr d2, [x0]
+; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI3_1]
+; SVE2-NEXT:    adrp x8, .LCPI3_0
+; SVE2-NEXT:    ldr d3, [x1]
+; SVE2-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; SVE2-NEXT:    fneg z0.d, p0/m, z0.d
+; SVE2-NEXT:    mov z4.d, z1.d
+; SVE2-NEXT:    fmov d1, d2
+; SVE2-NEXT:    fmov d4, d3
+; SVE2-NEXT:    bsl v0.16b, v1.16b, v4.16b
+; SVE2-NEXT:    str d0, [x0]
+; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x8, [x1]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
+  %a = load double, ptr %ap
+  %b = load double, ptr %bp
+  %r = call double @llvm.copysign.f64(double %a, double %b)
+  store double %r, ptr %ap
+  ret void
+}
+
 ;============ f16
 
 define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {

From dc7b743515d3a463465dd38a62869ab9f77704cd Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 5 Mar 2025 17:18:07 +0000
Subject: [PATCH 262/282] [AArch64] Fix SVE scalar fcopysign lowering without
 neon. (#129787)

Without this we can try to generate invalid instructions or create
illegal types. This patch generates a SVE fcopysign instead and use its
lowering. BF16 is left out of the moment as it doesn't lower
successfully (but could use the same code as fp16).

(cherry picked from commit d4ab3df320f9eebf11cc5fb600a0919f93678abe)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  19 +++
 ...e-streaming-mode-fixed-length-fcopysign.ll | 139 +++++++-----------
 2 files changed, 74 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 16fcd589cecd1..cfd0fc32357ce 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10683,6 +10683,25 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
     return convertFromScalableVector(DAG, VT, Res);
   }
 
+  // With SVE, but without Neon, extend the scalars to scalable vectors and use
+  // a SVE FCOPYSIGN.
+  if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
+      Subtarget->isSVEorStreamingSVEAvailable()) {
+    if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
+      return SDValue();
+    EVT SVT = getPackedSVEVectorVT(VT);
+
+    SDValue Ins1 =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
+                    DAG.getConstant(0, DL, MVT::i64));
+    SDValue Ins2 =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
+                    DAG.getConstant(0, DL, MVT::i64));
+    SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
+                       DAG.getConstant(0, DL, MVT::i64));
+  }
+
   auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
     if (VT.isScalableVector())
       return getSVESafeBitCast(VT, Op, DAG);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 238c124b7cb06..79921e25caf53 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -11,32 +11,21 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @test_copysign_f16(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_f16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    adrp x8, .LCPI0_0
+; SVE-NEXT:    ldr h0, [x1]
 ; SVE-NEXT:    ldr h1, [x0]
-; SVE-NEXT:    ldr h2, [x1]
-; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
-; SVE-NEXT:    adrp x8, .LCPI0_1
-; SVE-NEXT:    ldr q4, [x8, :lo12:.LCPI0_1]
-; SVE-NEXT:    mov z3.d, z0.d
-; SVE-NEXT:    fmov s0, s1
-; SVE-NEXT:    fmov s3, s2
-; SVE-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE-NEXT:    and z0.h, z0.h, #0x8000
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str h0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_f16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    adrp x8, .LCPI0_0
-; SVE2-NEXT:    ldr h1, [x0]
-; SVE2-NEXT:    ldr h2, [x1]
-; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
-; SVE2-NEXT:    adrp x8, .LCPI0_1
-; SVE2-NEXT:    ldr q4, [x8, :lo12:.LCPI0_1]
-; SVE2-NEXT:    mov z3.d, z0.d
-; SVE2-NEXT:    fmov s0, s1
-; SVE2-NEXT:    fmov s3, s2
-; SVE2-NEXT:    bif v0.16b, v3.16b, v4.16b
-; SVE2-NEXT:    str h0, [x0]
+; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
+; SVE2-NEXT:    ldr h1, [x1]
+; SVE2-NEXT:    ldr h2, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    str h2, [x0]
 ; SVE2-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_f16:
@@ -66,32 +55,40 @@ define void @test_copysign_f16(ptr %ap, ptr %bp) {
 define void @test_copysign_bf16(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    adrp x8, .LCPI1_0
-; SVE-NEXT:    ldr h1, [x0]
-; SVE-NEXT:    ldr h2, [x1]
-; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
-; SVE-NEXT:    adrp x8, .LCPI1_1
-; SVE-NEXT:    ldr q4, [x8, :lo12:.LCPI1_1]
-; SVE-NEXT:    mov z3.d, z0.d
-; SVE-NEXT:    fmov s0, s1
-; SVE-NEXT:    fmov s3, s2
-; SVE-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE-NEXT:    sub sp, sp, #16
+; SVE-NEXT:    .cfi_def_cfa_offset 16
+; SVE-NEXT:    ldr h0, [x0]
+; SVE-NEXT:    ldr h1, [x1]
+; SVE-NEXT:    fmov w8, s0
+; SVE-NEXT:    str h1, [sp, #12]
+; SVE-NEXT:    ldrb w9, [sp, #13]
+; SVE-NEXT:    and w8, w8, #0x7fff
+; SVE-NEXT:    tst w9, #0x80
+; SVE-NEXT:    fmov s0, w8
+; SVE-NEXT:    eor w8, w8, #0x8000
+; SVE-NEXT:    fmov s1, w8
+; SVE-NEXT:    fcsel h0, h1, h0, ne
 ; SVE-NEXT:    str h0, [x0]
+; SVE-NEXT:    add sp, sp, #16
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_bf16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    adrp x8, .LCPI1_0
-; SVE2-NEXT:    ldr h1, [x0]
-; SVE2-NEXT:    ldr h2, [x1]
-; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
-; SVE2-NEXT:    adrp x8, .LCPI1_1
-; SVE2-NEXT:    ldr q4, [x8, :lo12:.LCPI1_1]
-; SVE2-NEXT:    mov z3.d, z0.d
-; SVE2-NEXT:    fmov s0, s1
-; SVE2-NEXT:    fmov s3, s2
-; SVE2-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE2-NEXT:    sub sp, sp, #16
+; SVE2-NEXT:    .cfi_def_cfa_offset 16
+; SVE2-NEXT:    ldr h0, [x0]
+; SVE2-NEXT:    ldr h1, [x1]
+; SVE2-NEXT:    fmov w8, s0
+; SVE2-NEXT:    str h1, [sp, #12]
+; SVE2-NEXT:    ldrb w9, [sp, #13]
+; SVE2-NEXT:    and w8, w8, #0x7fff
+; SVE2-NEXT:    tst w9, #0x80
+; SVE2-NEXT:    fmov s0, w8
+; SVE2-NEXT:    eor w8, w8, #0x8000
+; SVE2-NEXT:    fmov s1, w8
+; SVE2-NEXT:    fcsel h0, h1, h0, ne
 ; SVE2-NEXT:    str h0, [x0]
+; SVE2-NEXT:    add sp, sp, #16
 ; SVE2-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_bf16:
@@ -139,32 +136,21 @@ define void @test_copysign_bf16(ptr %ap, ptr %bp) {
 define void @test_copysign_f32(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_f32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    adrp x8, .LCPI2_0
+; SVE-NEXT:    ldr s0, [x1]
 ; SVE-NEXT:    ldr s1, [x0]
-; SVE-NEXT:    ldr s2, [x1]
-; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
-; SVE-NEXT:    adrp x8, .LCPI2_1
-; SVE-NEXT:    ldr q4, [x8, :lo12:.LCPI2_1]
-; SVE-NEXT:    mov z3.d, z0.d
-; SVE-NEXT:    fmov s0, s1
-; SVE-NEXT:    fmov s3, s2
-; SVE-NEXT:    bif v0.16b, v3.16b, v4.16b
+; SVE-NEXT:    and z0.s, z0.s, #0x80000000
+; SVE-NEXT:    and z1.s, z1.s, #0x7fffffff
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str s0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    adrp x8, .LCPI2_0
-; SVE2-NEXT:    ldr s1, [x0]
-; SVE2-NEXT:    ldr s2, [x1]
-; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
-; SVE2-NEXT:    adrp x8, .LCPI2_1
-; SVE2-NEXT:    ldr q4, [x8, :lo12:.LCPI2_1]
-; SVE2-NEXT:    mov z3.d, z0.d
-; SVE2-NEXT:    fmov s0, s1
-; SVE2-NEXT:    fmov s3, s2
-; SVE2-NEXT:    bif v0.16b, v3.16b, v4.16b
-; SVE2-NEXT:    str s0, [x0]
+; SVE2-NEXT:    mov z0.s, #0x7fffffff
+; SVE2-NEXT:    ldr s1, [x1]
+; SVE2-NEXT:    ldr s2, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    str s2, [x0]
 ; SVE2-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_f32:
@@ -187,36 +173,21 @@ define void @test_copysign_f32(ptr %ap, ptr %bp) {
 define void @test_copysign_f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    adrp x8, .LCPI3_1
-; SVE-NEXT:    ptrue p0.d, vl2
-; SVE-NEXT:    ldr d2, [x0]
-; SVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_1]
-; SVE-NEXT:    adrp x8, .LCPI3_0
-; SVE-NEXT:    ldr d3, [x1]
-; SVE-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; SVE-NEXT:    fneg z0.d, p0/m, z0.d
-; SVE-NEXT:    mov z4.d, z1.d
-; SVE-NEXT:    fmov d1, d2
-; SVE-NEXT:    fmov d4, d3
-; SVE-NEXT:    bsl v0.16b, v1.16b, v4.16b
+; SVE-NEXT:    ldr d0, [x1]
+; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    and z0.d, z0.d, #0x8000000000000000
+; SVE-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str d0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    adrp x8, .LCPI3_1
-; SVE2-NEXT:    ptrue p0.d, vl2
+; SVE2-NEXT:    mov z0.d, #0x7fffffffffffffff
+; SVE2-NEXT:    ldr d1, [x1]
 ; SVE2-NEXT:    ldr d2, [x0]
-; SVE2-NEXT:    ldr q0, [x8, :lo12:.LCPI3_1]
-; SVE2-NEXT:    adrp x8, .LCPI3_0
-; SVE2-NEXT:    ldr d3, [x1]
-; SVE2-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; SVE2-NEXT:    fneg z0.d, p0/m, z0.d
-; SVE2-NEXT:    mov z4.d, z1.d
-; SVE2-NEXT:    fmov d1, d2
-; SVE2-NEXT:    fmov d4, d3
-; SVE2-NEXT:    bsl v0.16b, v1.16b, v4.16b
-; SVE2-NEXT:    str d0, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_f64:

From 0383020b6c1a1fdfa765050b9ef6172a422d332b Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Wed, 19 Mar 2025 13:44:02 +0000
Subject: [PATCH 263/282] [llvm] Fix crash when complex deinterleaving operates
 on an unrolled loop (#129735)

When attempting to perform complex deinterleaving on an unrolled loop
containing a reduction, the complex deinterleaving pass would fail to
accommodate the wider types when accumulating the unrolled paths.
Instead of trying to alter the incoming IR to fit expectations, the pass
should instead decide against processing any reduction that results in a
non-complex or non-vector value.

(cherry picked from commit 3f4b2f12a1e3e87e4bfb86937cc1ccdd4d38dcf5)
---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp |  11 +
 .../complex-deinterleaving-unrolled-cdot.ll   | 191 ++++++++++++++++++
 2 files changed, 202 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 92053ed561901..4cd378f9aa595 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1741,6 +1741,17 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
       LLVM_DEBUG(
           dbgs() << "Identified single reduction starting from instruction: "
                  << *Real << "/" << *ReductionInfo[Real].second << "\n");
+
+      // Reducing to a single vector is not supported, only permit reducing down
+      // to scalar values.
+      // Doing this here will leave the prior node in the graph,
+      // however with no uses the node will be unreachable by the replacement
+      // process. That along with the usage outside the graph should prevent the
+      // replacement process from kicking off at all for this graph.
+      // TODO Add support for reducing to a single vector value
+      if (ReductionInfo[Real].second->getType()->isVectorTy())
+        continue;
+
       Processed[i] = true;
       auto RootNode = prepareCompositeNode(
           ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
new file mode 100644
index 0000000000000..faefaf9bad7b1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a0, <vscale x 32 x i8> %b0, <vscale x 32 x i8> %a1, <vscale x 32 x i8> %b1) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-SVE2-NEXT:    [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-SVE2-NEXT:    [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-SVE2-NEXT:    [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-SVE2-NEXT:    [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-SVE2-NEXT:    [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP23]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-SVE-NEXT:    [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-SVE-NEXT:    [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-SVE-NEXT:    [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-SVE-NEXT:    [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP6]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-NOSVE-NEXT:    [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-NOSVE-NEXT:    [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-NOSVE-NEXT:    [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-NOSVE-NEXT:    [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-NOSVE-NEXT:    [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-NOSVE-NEXT:    [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-NOSVE-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce33, %vector.body ]
+  %vec.phi25 = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce34, %vector.body ]
+  %a0.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a0)
+  %a0.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a0.deinterleaved, 0
+  %a0.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a0.deinterleaved, 1
+  %a1.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a1)
+  %a1.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a1.deinterleaved, 0
+  %a1.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a1.deinterleaved, 1
+  %a0.real.ext = sext <vscale x 16 x i8> %a0.real to <vscale x 16 x i32>
+  %a1.real.ext = sext <vscale x 16 x i8> %a1.real to <vscale x 16 x i32>
+  %b0.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b0)
+  %b0.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b0.deinterleaved, 0
+  %b0.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b0.deinterleaved, 1
+  %b1.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b1)
+  %b1.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b1.deinterleaved, 0
+  %b1.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b1.deinterleaved, 1
+  %b0.real.ext = sext <vscale x 16 x i8> %b0.real to <vscale x 16 x i32>
+  %b1.real.ext = sext <vscale x 16 x i8> %b1.real to <vscale x 16 x i32>
+  %18 = mul nsw <vscale x 16 x i32> %b0.real.ext, %a0.real.ext
+  %19 = mul nsw <vscale x 16 x i32> %b1.real.ext, %a1.real.ext
+  %a0.imag.ext = sext <vscale x 16 x i8> %a0.imag to <vscale x 16 x i32>
+  %a1.imag.ext = sext <vscale x 16 x i8> %a1.imag to <vscale x 16 x i32>
+  %b0.imag.ext = sext <vscale x 16 x i8> %b0.imag to <vscale x 16 x i32>
+  %b1.imag.ext = sext <vscale x 16 x i8> %b1.imag to <vscale x 16 x i32>
+  %24 = mul nsw <vscale x 16 x i32> %b0.imag.ext, %a0.imag.ext
+  %25 = mul nsw <vscale x 16 x i32> %b1.imag.ext, %a1.imag.ext
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %18)
+  %partial.reduce32 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi25, <vscale x 16 x i32> %19)
+  %26 = sub nsw <vscale x 16 x i32> zeroinitializer, %24
+  %27 = sub nsw <vscale x 16 x i32> zeroinitializer, %25
+  %partial.reduce33 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %partial.reduce, <vscale x 16 x i32> %26)
+  %partial.reduce34 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %partial.reduce32, <vscale x 16 x i32> %27)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <vscale x 4 x i32> %partial.reduce34, %partial.reduce33
+  %29 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %bin.rdx)
+  ret i32 %29
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 16 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)

From 2198410a8a8a0723dc744af5a2b7239ee5722662 Mon Sep 17 00:00:00 2001
From: David Tellenbach <dtellenbach@apple.com>
Date: Mon, 17 Mar 2025 17:23:58 -0700
Subject: [PATCH 264/282] [compiler-rt][Darwin][x86] Fix
 instrprof-darwin-exports test (#131425)

ld64 issues a warning about section alignment which was counted as an
unexpected exported symbol and the test failed.

Fixed by disabling all linker warnings using -Wl,-w.

(cherry picked from commit 94426df66a8d7c2321f9e197e5ef9636b0d5ce70)
---
 compiler-rt/test/profile/instrprof-darwin-exports.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/profile/instrprof-darwin-exports.c b/compiler-rt/test/profile/instrprof-darwin-exports.c
index 079d5d28ed24d..1a2ac8c813272 100644
--- a/compiler-rt/test/profile/instrprof-darwin-exports.c
+++ b/compiler-rt/test/profile/instrprof-darwin-exports.c
@@ -7,13 +7,13 @@
 // just "_main" produces no warnings or errors.
 //
 // RUN: echo "_main" > %t.exports
-// RUN: %clang_pgogen -Werror -Wl,-exported_symbols_list,%t.exports -o %t %s 2>&1 | tee %t.log
-// RUN: %clang_profgen -Werror -fcoverage-mapping -Wl,-exported_symbols_list,%t.exports -o %t %s 2>&1 | tee -a %t.log
+// RUN: %clang_pgogen -Werror -Wl,-exported_symbols_list,%t.exports -Wl,-w -o %t %s 2>&1 | tee %t.log
+// RUN: %clang_profgen -Werror -fcoverage-mapping -Wl,-exported_symbols_list,%t.exports -Wl,-w -o %t %s 2>&1 | tee -a %t.log
 // RUN: cat %t.log | count 0
 
 // 2) Ditto (1), but for GCOV.
 //
-// RUN: %clang -Werror -Wl,-exported_symbols_list,%t.exports --coverage -o %t.gcov %s | tee -a %t.gcov.log
+// RUN: %clang -Werror -Wl,-exported_symbols_list,%t.exports -Wl,-w --coverage -o %t.gcov %s | tee -a %t.gcov.log
 // RUN: cat %t.gcov.log | count 0
 
 // 3) The default set of weak external symbols should match the set of symbols

From e0e8071815c76b935c2145699c72bf833f0d0af5 Mon Sep 17 00:00:00 2001
From: Alexey Karyakin <akaryaki@quicinc.com>
Date: Tue, 11 Mar 2025 10:41:01 -0500
Subject: [PATCH 265/282] [hexagon] Prevent alignment search beyond a label
 (#130631)

When searching for packets to .align, don't consider ones which would
require padding beyond a label.

There are two problems with padding beyond a label:
- the distance between labels may increase for some offsets to become
too large;
- u/sleb128 values that encode a difference will not be updated because
they are computed before the align command is handled.

This is more a short-term fix/hack. The proper solution would be to
unify `.align` and `.falign` handling and move it to the layout loop.

(cherry picked from commit 1fe463182cead6e7c6119ab410eae9e9d969325a)
---
 .../Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 18 ++++++++++++++++++
 llvm/test/MC/Hexagon/align-leb128.s            | 18 ++++++++++++++++++
 llvm/test/MC/Hexagon/align.s                   | 13 +++++++++++++
 3 files changed, 49 insertions(+)
 create mode 100644 llvm/test/MC/Hexagon/align-leb128.s

diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 98b1dde8fa3fc..725067e0c9bdd 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -728,6 +728,24 @@ class HexagonAsmBackend : public MCAsmBackend {
               MCContext &Context = Asm.getContext();
               auto &RF = cast<MCRelaxableFragment>(*Frags[K]);
               auto &Inst = const_cast<MCInst &>(RF.getInst());
+
+              const bool WouldTraverseLabel = llvm::any_of(
+                  Asm.symbols(), [&Asm, &RF, &Inst](MCSymbol const &sym) {
+                    uint64_t Offset = 0;
+                    const bool HasOffset = Asm.getSymbolOffset(sym, Offset);
+                    const unsigned PacketSizeBytes =
+                        HexagonMCInstrInfo::bundleSize(Inst) *
+                        HEXAGON_INSTR_SIZE;
+                    const bool OffsetPastSym =
+                        Offset <= (Asm.getFragmentOffset(RF) + PacketSizeBytes);
+                    return !sym.isVariable() && Offset != 0 && HasOffset &&
+                           OffsetPastSym;
+                  });
+              if (WouldTraverseLabel) {
+                Size = 0;
+                break;
+              }
+
               while (Size > 0 &&
                      HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) {
                 MCInst *Nop = Context.createMCInst();
diff --git a/llvm/test/MC/Hexagon/align-leb128.s b/llvm/test/MC/Hexagon/align-leb128.s
new file mode 100644
index 0000000000000..77018f0114311
--- /dev/null
+++ b/llvm/test/MC/Hexagon/align-leb128.s
@@ -0,0 +1,18 @@
+# RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -x .data - \
+# RUN:   | FileCheck %s --match-full-lines
+
+# Illustrate the case when padding packets across labels also breaks leb128
+# relocations. This happens because .align padding is inserted once at the
+# very end of the section layout.
+L1:
+  nop
+L2:
+.size L1, L2-L1
+.align 16
+  nop
+.data
+.word L2-L1
+.uleb128 L2-L1
+
+# CHECK: Hex dump of section '.data':
+# CHECK-NEXT: 0x00000000 04000000 04 .....
diff --git a/llvm/test/MC/Hexagon/align.s b/llvm/test/MC/Hexagon/align.s
index 9c2978df71373..e17d09cfd8c96 100644
--- a/llvm/test/MC/Hexagon/align.s
+++ b/llvm/test/MC/Hexagon/align.s
@@ -58,3 +58,16 @@ r0 = vextract(v0, r0)
   r1 = sub (##1, r1) }
 .align 16
 { r0 = sub (#1, r0) }
+
+# Don't search backwards to pad packets beyond a label:
+{ r1 = add(r1, r0) }
+# CHECK-NEXT: { r1 = add(r1,r0)
+# CHECK-NOT:  nop
+
+post_label:
+.align 16
+# CHECK-LABEL: post_label
+# CHECK-NEXT: { nop
+# CHECK-NEXT:   nop }
+# CHECK-NEXT: { r1 = sub(#1,r1) }
+{ r1 = sub(#1, r1) }

From 95763651e25c6d31a6f41c28e7a22e9203a4dba9 Mon Sep 17 00:00:00 2001
From: Abinaya Saravanan <quic_asaravan@quicinc.com>
Date: Thu, 13 Mar 2025 03:28:26 +0530
Subject: [PATCH 266/282] [HEXAGON] Add support to lower "FREEZE a half(f16)"
 instruction on Hexagon and fix the isel-buildvector-v2f16.ll assertion
 (#130977)

(cherry picked from commit 9c65e6ac115a7d8566c874537791125c3ace7c1a)
---
 llvm/lib/Target/Hexagon/HexagonISelLowering.h |  1 +
 .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 22 +++++-----
 llvm/test/CodeGen/Hexagon/fp16-promote.ll     | 44 +++++++++++++++++++
 3 files changed, 56 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/fp16-promote.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index aaa9c65c1e07e..4df88b3a8abd7 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -362,6 +362,7 @@ class HexagonTargetLowering : public TargetLowering {
   shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
     return AtomicExpansionKind::LLSC;
   }
+  bool softPromoteHalfType() const override { return true; }
 
 private:
   void initializeHVXLowering();
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 1a19e81a68f08..a7eb20a3e5ff9 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1618,17 +1618,6 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
   for (unsigned i = 0; i != Size; ++i)
     Ops.push_back(Op.getOperand(i));
 
-  // First, split the BUILD_VECTOR for vector pairs. We could generate
-  // some pairs directly (via splat), but splats should be generated
-  // by the combiner prior to getting here.
-  if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
-    ArrayRef<SDValue> A(Ops);
-    MVT SingleTy = typeSplit(VecTy).first;
-    SDValue V0 = buildHvxVectorReg(A.take_front(Size/2), dl, SingleTy, DAG);
-    SDValue V1 = buildHvxVectorReg(A.drop_front(Size/2), dl, SingleTy, DAG);
-    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
-  }
-
   if (VecTy.getVectorElementType() == MVT::i1)
     return buildHvxVectorPred(Ops, dl, VecTy, DAG);
 
@@ -1645,6 +1634,17 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
     return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0);
   }
 
+  // First, split the BUILD_VECTOR for vector pairs. We could generate
+  // some pairs directly (via splat), but splats should be generated
+  // by the combiner prior to getting here.
+  if (VecTy.getSizeInBits() == 16 * Subtarget.getVectorLength()) {
+    ArrayRef<SDValue> A(Ops);
+    MVT SingleTy = typeSplit(VecTy).first;
+    SDValue V0 = buildHvxVectorReg(A.take_front(Size / 2), dl, SingleTy, DAG);
+    SDValue V1 = buildHvxVectorReg(A.drop_front(Size / 2), dl, SingleTy, DAG);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
+  }
+
   return buildHvxVectorReg(Ops, dl, VecTy, DAG);
 }
 
diff --git a/llvm/test/CodeGen/Hexagon/fp16-promote.ll b/llvm/test/CodeGen/Hexagon/fp16-promote.ll
new file mode 100644
index 0000000000000..1ef0a133ce30a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/fp16-promote.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon  < %s | FileCheck %s
+
+define half @freeze_half_undef() nounwind {
+; CHECK-LABEL: freeze_half_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __truncsfhf2
+; CHECK-NEXT:     r0 = #0
+; CHECK-NEXT:     allocframe(#0)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __extendhfsf2
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __truncsfhf2
+; CHECK-NEXT:     r0 = sfadd(r0,r0)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %y1 = freeze half undef
+  %t1 = fadd half %y1, %y1
+  ret half %t1
+}
+
+define half @freeze_half_poison(half %maybe.poison) {
+; CHECK-LABEL: freeze_half_poison:
+; CHECK:  // %bb.0:
+; CHECK:    {
+; CHECK-NEXT:     call __extendhfsf2
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __truncsfhf2
+; CHECK-NEXT:     r0 = sfadd(r0,r0)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %y1 = freeze half %maybe.poison
+  %t1 = fadd half %y1, %y1
+  ret half %t1
+}

From 6034661369c40b82538f44e6d742a7120ec418c4 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01@loongson.cn>
Date: Fri, 21 Mar 2025 10:55:50 +0800
Subject: [PATCH 267/282] [LoongArch] Pre-commit test for fixing tls-le symbol
 type

The symbol type of tls-le must be `TLS`, it was incorrectly set
as `NOTYPE`.

A later commit will fix it.

(cherry picked from commit 87adafcd2e248fa69d1f776a9e60f95df03b885d)
---
 .../CodeGen/LoongArch/fix-tle-le-sym-type.ll  | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll

diff --git a/llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll b/llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll
new file mode 100644
index 0000000000000..fe5f2195f0dc7
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll
@@ -0,0 +1,24 @@
+; RUN: llc --mtriple=loongarch32 --filetype=obj %s -o %t-la32
+; RUN: llvm-readelf -s %t-la32 | FileCheck %s --check-prefix=LA32
+
+; RUN: llc --mtriple=loongarch64 --filetype=obj %s -o %t-la64
+; RUN: llvm-readelf -s %t-la64 | FileCheck %s --check-prefix=LA64
+
+; LA32:      Symbol table '.symtab' contains [[#]] entries:
+; LA32-NEXT:    Num:    Value  Size Type    Bind   Vis      Ndx Name
+; LA32:              00000000     0 NOTYPE  GLOBAL DEFAULT  UND tls_sym
+
+; LA64:      Symbol table '.symtab' contains [[#]] entries:
+; LA64-NEXT:    Num:    Value          Size Type    Bind   Vis      Ndx Name
+; LA64:              0000000000000000     0 NOTYPE  GLOBAL DEFAULT  UND tls_sym
+
+@tls_sym = external thread_local(localexec) global i32
+
+define dso_local signext i32 @test_tlsle() nounwind {
+entry:
+  %0 = call ptr @llvm.threadlocal.address.p0(ptr @tls_sym)
+  %1 = load i32, ptr %0
+  ret i32 %1
+}
+
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)

From a311bc81d957c74739f39889329d13161673a696 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 20 Feb 2025 21:49:19 -0100
Subject: [PATCH 268/282] [llvm-dlltool] Implement the --identify option
 (#127465)

This option prints the name of the DLL that gets imported, when linking
against an import library.

This is implemented using the same strategy as GNU dlltool does; looking
for the contents of .idata$6 or .idata$7 chunks. The right section name
to check for is chosen by identifying whether the library is GNU or LLVM
style. In the case of GNU import libraries, the DLL name is in an
.idata$7 chunk. However there are also other chunks with that section
name (for entries for the IAT or ILT); identify these by looking for
whether a chunk contains relocations.

Alternatively, one could also just look for .idata$2 chunks, look for
relocations at the right offset, and locate data at the symbol that the
relocation points at (which may be in the same or in another object
file).

(cherry picked from commit dcc08a17c781a5066ab17b9791e1c455f7cedbf7)
---
 .../llvm-dlltool/DlltoolDriver.cpp            | 146 +++++++++++++++++-
 llvm/lib/ToolDrivers/llvm-dlltool/Options.td  |   5 +
 .../llvm-dlltool/Inputs/gnu_foo_lib_h.yaml    | 133 ++++++++++++++++
 .../Inputs/gnu_foo_lib_s00000.yaml            | 116 ++++++++++++++
 .../llvm-dlltool/Inputs/gnu_foo_lib_t.yaml    | 119 ++++++++++++++
 .../llvm-dlltool/Inputs/llvm_foo_dll_1.yaml   |  69 +++++++++
 .../llvm-dlltool/Inputs/llvm_foo_dll_2.yaml   |  18 +++
 .../llvm-dlltool/Inputs/llvm_foo_dll_3.yaml   |  23 +++
 llvm/test/tools/llvm-dlltool/identify.test    |  69 +++++++++
 9 files changed, 697 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_h.yaml
 create mode 100644 llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_s00000.yaml
 create mode 100644 llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_t.yaml
 create mode 100644 llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_1.yaml
 create mode 100644 llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_2.yaml
 create mode 100644 llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_3.yaml
 create mode 100644 llvm/test/tools/llvm-dlltool/identify.test

diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index 1782e24287860..380fbd8b6fc6c 100644
--- a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Object/Archive.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/COFFImportFile.h"
 #include "llvm/Object/COFFModuleDefinition.h"
@@ -158,6 +159,143 @@ bool parseModuleDefinition(StringRef DefFileName, MachineTypes Machine,
   return true;
 }
 
+int printError(llvm::Error E, Twine File) {
+  if (!E)
+    return 0;
+  handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
+    llvm::errs() << "error opening " << File << ": " << EIB.message() << "\n";
+  });
+  return 1;
+}
+
+template <typename Callable>
+int forEachCoff(object::Archive &Archive, StringRef Name, Callable Callback) {
+  Error Err = Error::success();
+  for (auto &C : Archive.children(Err)) {
+    Expected<StringRef> NameOrErr = C.getName();
+    if (!NameOrErr)
+      return printError(NameOrErr.takeError(), Name);
+    StringRef Name = *NameOrErr;
+
+    Expected<MemoryBufferRef> ChildMB = C.getMemoryBufferRef();
+    if (!ChildMB)
+      return printError(ChildMB.takeError(), Name);
+
+    if (identify_magic(ChildMB->getBuffer()) == file_magic::coff_object) {
+      auto Obj = object::COFFObjectFile::create(*ChildMB);
+      if (!Obj)
+        return printError(Obj.takeError(), Name);
+      if (!Callback(*Obj->get(), Name))
+        return 1;
+    }
+  }
+  if (Err)
+    return printError(std::move(Err), Name);
+  return 0;
+}
+
+// To find the named of the imported DLL from an import library, we can either
+// inspect the object files that form the import table entries, or we could
+// just look at the archive member names, for MSVC style import libraries.
+// Looking at the archive member names doesn't work for GNU style import
+// libraries though, while inspecting the import table entries works for
+// both. (MSVC style import libraries contain a couple regular object files
+// for the header/trailers.)
+//
+// This implementation does the same as GNU dlltool does; look at the
+// content of ".idata$7" sections, or for MSVC style libraries, look
+// at ".idata$6" sections.
+//
+// For GNU style import libraries, there are also other data chunks in sections
+// named ".idata$7" (entries to the IAT or ILT); these are distinguished
+// by seeing that they contain relocations. (They also look like an empty
+// string when looking for null termination.)
+//
+// Alternatively, we could do things differently - look for any .idata$2
+// section; this would be import directory entries. At offset 0xc in them
+// there is the RVA of the import DLL name; look for a relocation at this
+// spot and locate the symbol that it points at. That symbol may either
+// be within the same object file (in the case of MSVC style import libraries)
+// or another object file (in the case of GNU import libraries).
+bool identifyImportName(const COFFObjectFile &Obj, StringRef ObjName,
+                        std::vector<StringRef> &Names, bool IsMsStyleImplib) {
+  StringRef TargetName = IsMsStyleImplib ? ".idata$6" : ".idata$7";
+  for (const auto &S : Obj.sections()) {
+    Expected<StringRef> NameOrErr = S.getName();
+    if (!NameOrErr) {
+      printError(NameOrErr.takeError(), ObjName);
+      return false;
+    }
+    StringRef Name = *NameOrErr;
+    if (Name != TargetName)
+      continue;
+
+    // GNU import libraries contain .idata$7 section in the per function
+    // objects too, but they contain relocations.
+    if (!IsMsStyleImplib && !S.relocations().empty())
+      continue;
+
+    Expected<StringRef> ContentsOrErr = S.getContents();
+    if (!ContentsOrErr) {
+      printError(ContentsOrErr.takeError(), ObjName);
+      return false;
+    }
+    StringRef Contents = *ContentsOrErr;
+    Contents = Contents.substr(0, Contents.find('\0'));
+    if (Contents.empty())
+      continue;
+    Names.push_back(Contents);
+    return true;
+  }
+  return true;
+}
+
+int doIdentify(StringRef File, bool IdentifyStrict) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> MaybeBuf = MemoryBuffer::getFile(
+      File, /*IsText=*/false, /*RequiredNullTerminator=*/false);
+  if (!MaybeBuf)
+    return printError(errorCodeToError(MaybeBuf.getError()), File);
+  if (identify_magic(MaybeBuf.get()->getBuffer()) != file_magic::archive) {
+    llvm::errs() << File << " is not a library\n";
+    return 1;
+  }
+
+  std::unique_ptr<MemoryBuffer> B = std::move(MaybeBuf.get());
+  Error Err = Error::success();
+  object::Archive Archive(B->getMemBufferRef(), Err);
+  if (Err)
+    return printError(std::move(Err), B->getBufferIdentifier());
+
+  bool IsMsStyleImplib = false;
+  for (const auto &S : Archive.symbols()) {
+    if (S.getName() == "__NULL_IMPORT_DESCRIPTOR") {
+      IsMsStyleImplib = true;
+      break;
+    }
+  }
+  std::vector<StringRef> Names;
+  if (forEachCoff(Archive, B->getBufferIdentifier(),
+                  [&](const COFFObjectFile &Obj, StringRef ObjName) -> bool {
+                    return identifyImportName(Obj, ObjName, Names,
+                                              IsMsStyleImplib);
+                  }))
+    return 1;
+
+  if (Names.empty()) {
+    llvm::errs() << "No DLL import name found in " << File << "\n";
+    return 1;
+  }
+  if (Names.size() > 1 && IdentifyStrict) {
+    llvm::errs() << File << "contains imports for two or more DLLs\n";
+    return 1;
+  }
+
+  for (StringRef S : Names)
+    llvm::outs() << S << "\n";
+
+  return 0;
+}
+
 } // namespace
 
 int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
@@ -173,7 +311,8 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
 
   // Handle when no input or output is specified
   if (Args.hasArgNoClaim(OPT_INPUT) ||
-      (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) {
+      (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l) &&
+       !Args.hasArgNoClaim(OPT_I))) {
     Table.printHelp(outs(), "llvm-dlltool [options] file...", "llvm-dlltool",
                     false);
     llvm::outs()
@@ -185,6 +324,11 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
     llvm::errs() << "ignoring unknown argument: " << Arg->getAsString(Args)
                  << "\n";
 
+  if (Args.hasArg(OPT_I)) {
+    return doIdentify(Args.getLastArg(OPT_I)->getValue(),
+                      Args.hasArg(OPT_identify_strict));
+  }
+
   if (!Args.hasArg(OPT_d)) {
     llvm::errs() << "no definition file specified\n";
     return 1;
diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/Options.td b/llvm/lib/ToolDrivers/llvm-dlltool/Options.td
index 7810694c98e36..4fd80189aff29 100644
--- a/llvm/lib/ToolDrivers/llvm-dlltool/Options.td
+++ b/llvm/lib/ToolDrivers/llvm-dlltool/Options.td
@@ -21,6 +21,11 @@ def k_alias: Flag<["--"], "kill-at">, Alias<k>;
 def no_leading_underscore: Flag<["--"], "no-leading-underscore">,
     HelpText<"Don't add leading underscores on symbols">;
 
+def I: JoinedOrSeparate<["-"], "I">, HelpText<"Identify DLL name from import library">;
+def I_long : JoinedOrSeparate<["--"], "identify">, Alias<I>;
+
+def identify_strict : Flag<["--"], "identify-strict">, HelpText<"Error out if the --identify option detects more than one DLL">;
+
 //==============================================================================
 // The flags below do nothing. They are defined only for dlltool compatibility.
 //==============================================================================
diff --git a/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_h.yaml b/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_h.yaml
new file mode 100644
index 0000000000000..26f3493d62143
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_h.yaml
@@ -0,0 +1,133 @@
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_LINE_NUMS_STRIPPED, IMAGE_FILE_32BIT_MACHINE ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .bss
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            '.idata$2'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '0000000000000000000000000000000000000000'
+    SizeOfRawData:   20
+    Relocations:
+      - VirtualAddress:  0
+        SymbolName:      '.idata$4'
+        Type:            IMAGE_REL_I386_DIR32NB
+      - VirtualAddress:  12
+        SymbolName:      __foo_lib_iname
+        Type:            IMAGE_REL_I386_DIR32NB
+      - VirtualAddress:  16
+        SymbolName:      '.idata$5'
+        Type:            IMAGE_REL_I386_DIR32NB
+  - Name:            '.idata$5'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            '.idata$4'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+symbols:
+  - Name:            .file
+    Value:           0
+    SectionNumber:   -2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_FILE
+    File:            fake
+  - Name:            hname
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            fthunk
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            .data
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            .bss
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.idata$2'
+    Value:           0
+    SectionNumber:   4
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          20
+      NumberOfRelocations: 3
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.idata$4'
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            '.idata$5'
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            __head_foo_lib
+    Value:           0
+    SectionNumber:   4
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            __foo_lib_iname
+    Value:           0
+    SectionNumber:   0
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_s00000.yaml b/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_s00000.yaml
new file mode 100644
index 0000000000000..f09437fc99255
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_s00000.yaml
@@ -0,0 +1,116 @@
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_LINE_NUMS_STRIPPED, IMAGE_FILE_32BIT_MACHINE ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    Alignment:       4
+    SectionData:     FF25000000009090
+    SizeOfRawData:   8
+    Relocations:
+      - VirtualAddress:  2
+        SymbolName:      '.idata$5'
+        Type:            IMAGE_REL_I386_DIR32
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .bss
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            '.idata$7'
+    Characteristics: [ IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '00000000'
+    SizeOfRawData:   4
+    Relocations:
+      - VirtualAddress:  0
+        SymbolName:      __head_foo_lib
+        Type:            IMAGE_REL_I386_DIR32NB
+  - Name:            '.idata$5'
+    Characteristics: [ IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '00000000'
+    SizeOfRawData:   4
+    Relocations:
+      - VirtualAddress:  0
+        SymbolName:      '.idata$6'
+        Type:            IMAGE_REL_I386_DIR32NB
+  - Name:            '.idata$4'
+    Characteristics: [ IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '00000000'
+    SizeOfRawData:   4
+    Relocations:
+      - VirtualAddress:  0
+        SymbolName:      '.idata$6'
+        Type:            IMAGE_REL_I386_DIR32NB
+  - Name:            '.idata$6'
+    Characteristics: [ IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       2
+    SectionData:     '010066756E633100'
+    SizeOfRawData:   8
+symbols:
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            .data
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            .bss
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            '.idata$7'
+    Value:           0
+    SectionNumber:   4
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            '.idata$5'
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            '.idata$4'
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            '.idata$6'
+    Value:           0
+    SectionNumber:   7
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            _func1
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            __imp__func1
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            __head_foo_lib
+    Value:           0
+    SectionNumber:   0
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_t.yaml b/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_t.yaml
new file mode 100644
index 0000000000000..e4465293bec1a
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/Inputs/gnu_foo_lib_t.yaml
@@ -0,0 +1,119 @@
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_RELOCS_STRIPPED, IMAGE_FILE_LINE_NUMS_STRIPPED, IMAGE_FILE_32BIT_MACHINE ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .bss
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            '.idata$4'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '00000000'
+    SizeOfRawData:   4
+  - Name:            '.idata$5'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '00000000'
+    SizeOfRawData:   4
+  - Name:            '.idata$7'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     666F6F2E646C6C00
+    SizeOfRawData:   8
+symbols:
+  - Name:            .file
+    Value:           0
+    SectionNumber:   -2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_FILE
+    File:            fake
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            .data
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            .bss
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.idata$4'
+    Value:           0
+    SectionNumber:   4
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.idata$5'
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.idata$7'
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          8
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            __foo_lib_iname
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_1.yaml b/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_1.yaml
new file mode 100644
index 0000000000000..f3f669d63bcad
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_1.yaml
@@ -0,0 +1,69 @@
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [  ]
+sections:
+  - Name:            '.idata$2'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '0000000000000000000000000000000000000000'
+    SizeOfRawData:   20
+    Relocations:
+      - VirtualAddress:  12
+        SymbolName:      '.idata$6'
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  0
+        SymbolName:      '.idata$4'
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  16
+        SymbolName:      '.idata$5'
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+  - Name:            '.idata$6'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       2
+    SectionData:     666F6F2E646C6C00
+    SizeOfRawData:   8
+symbols:
+  - Name:            __IMPORT_DESCRIPTOR_foo
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            '.idata$2'
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_SECTION
+  - Name:            '.idata$6'
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            '.idata$4'
+    Value:           0
+    SectionNumber:   0
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_SECTION
+  - Name:            '.idata$5'
+    Value:           0
+    SectionNumber:   0
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_SECTION
+  - Name:            __NULL_IMPORT_DESCRIPTOR
+    Value:           0
+    SectionNumber:   0
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            "foo_NULL_THUNK_DATA"
+    Value:           0
+    SectionNumber:   0
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_2.yaml b/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_2.yaml
new file mode 100644
index 0000000000000..26b601fb74c54
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_2.yaml
@@ -0,0 +1,18 @@
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [  ]
+sections:
+  - Name:            '.idata$3'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '0000000000000000000000000000000000000000'
+    SizeOfRawData:   20
+symbols:
+  - Name:            __NULL_IMPORT_DESCRIPTOR
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_3.yaml b/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_3.yaml
new file mode 100644
index 0000000000000..68248597cbaeb
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/Inputs/llvm_foo_dll_3.yaml
@@ -0,0 +1,23 @@
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [  ]
+sections:
+  - Name:            '.idata$5'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       8
+    SectionData:     '0000000000000000'
+    SizeOfRawData:   8
+  - Name:            '.idata$4'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       8
+    SectionData:     '0000000000000000'
+    SizeOfRawData:   8
+symbols:
+  - Name:            "foo_NULL_THUNK_DATA"
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/llvm/test/tools/llvm-dlltool/identify.test b/llvm/test/tools/llvm-dlltool/identify.test
new file mode 100644
index 0000000000000..eb2792a8e41ae
--- /dev/null
+++ b/llvm/test/tools/llvm-dlltool/identify.test
@@ -0,0 +1,69 @@
+Test the -I / --identify option.
+
+Test with both GNU style and LLVM style import libraries; using
+sources from yaml to preserve the checking behaviour even if the
+output of llvm-dlltool itself would change.
+
+RUN: rm -rf %t && mkdir -p %t
+RUN: split-file %s %t
+
+RUN: yaml2obj %S/Inputs/gnu_foo_lib_h.yaml > %t/gnu_foo_lib_h.o
+RUN: yaml2obj %S/Inputs/gnu_foo_lib_s00000.yaml > %t/gnu_foo_lib_s00000.o
+RUN: yaml2obj %S/Inputs/gnu_foo_lib_t.yaml > %t/gnu_foo_lib_t.o
+RUN: llvm-ar rcs %t/gnu.a %t/gnu_foo_lib_h.o %t/gnu_foo_lib_s00000.o %t/gnu_foo_lib_t.o
+
+RUN: yaml2obj %S/Inputs/llvm_foo_dll_1.yaml > %t/llvm_foo_dll_1.o
+RUN: yaml2obj %S/Inputs/llvm_foo_dll_2.yaml > %t/llvm_foo_dll_2.o
+RUN: yaml2obj %S/Inputs/llvm_foo_dll_3.yaml > %t/llvm_foo_dll_3.o
+RUN: llvm-ar rcs %t/llvm.a %t/llvm_foo_dll_1.o %t/llvm_foo_dll_2.o %t/llvm_foo_dll_3.o
+
+
+Check that we can identify the DLL name from a GNU style import library.
+
+RUN: llvm-dlltool -I %t/gnu.a | FileCheck --check-prefix=FOO %s
+RUN: llvm-dlltool --identify %t/gnu.a | count 1
+
+FOO: foo.dll
+
+
+Check that we successfully can identify run while passing the
+--identify-strict option.
+
+RUN: llvm-dlltool -I %t/gnu.a --identify-strict | FileCheck --check-prefix=FOO %s
+
+
+Check that we can identify the DLL name from an LLVM style import library.
+
+RUN: llvm-dlltool -I %t/llvm.a | FileCheck --check-prefix=FOO %s
+RUN: llvm-dlltool -I %t/llvm.a | count 1
+
+
+Check that we can identify the DLL names from an import library that
+contains imports for multiple DLLs.
+
+RUN: llvm-dlltool -m i386:x86-64 -d %t/lib1.def -l %t/lib1.a
+RUN: llvm-dlltool -m i386:x86-64 -d %t/lib2.def -l %t/lib2.a
+RUN: llvm-ar qcsL %t/merged.a %t/lib1.a %t/lib2.a
+
+RUN: llvm-dlltool -I %t/merged.a | FileCheck --check-prefix=MERGED %s
+
+MERGED-DAG: lib1.dll
+MERGED-DAG: lib2.dll
+
+Check that --identify-strict fails this case, when there are multiple
+outputs.
+
+RUN: not llvm-dlltool -I %t/merged.a --identify-strict 2>&1 | FileCheck --check-prefix=ERROR %s
+
+ERROR: contains imports for two or more DLLs
+
+
+#--- lib1.def
+LIBRARY lib1.dll
+EXPORTS
+    func1
+
+#--- lib2.def
+LIBRARY lib2.dll
+EXPORTS
+    func2

From f7b6f23c6bb7a1276b8de569014672b7d29c2b7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 21 Feb 2025 01:20:29 +0200
Subject: [PATCH 269/282] [llvm-dlltool] Add a missing dependency

This was missed in dcc08a17c781a5066ab17b9791e1c455f7cedbf7.

(cherry picked from commit 1ca93b15482d3bfa1560b35960ab46fea65b3074)
---
 llvm/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt b/llvm/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt
index 855ae5f048ff7..5db08e7852d03 100644
--- a/llvm/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt
+++ b/llvm/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMDlltoolDriver
   DlltoolDriver.cpp
 
   LINK_COMPONENTS
+  BinaryFormat
   Object
   Option
   Support

From c86df914dee1b48ff54dda0eaec87f59bad069bb Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Sat, 22 Mar 2025 22:55:58 +0800
Subject: [PATCH 270/282] release/20.x: [Clang] Fix various bugs in alias CTAD
 transform

This patch cherry-picks 032ad590d6, 868c89ff0 and 38d71c9bd onto the 20
release branch.

The first patch addresses recently surfaced CTAD problems, which we
believe it would be nice to roll out the fix quickly, given the release
window is not closed yet.

The second patch is a follow-up to the first and fixed a test failure
on the arm32 platform.

The third patch follows-up on the previous patch that I cherry-picked to
the 20 release branch, which removes a unnecessary assertion.
---
 clang/docs/ReleaseNotes.rst                   |   2 +
 clang/lib/Sema/SemaTemplateDeductionGuide.cpp |  28 ++--
 clang/lib/Sema/SemaTemplateInstantiate.cpp    |  46 ++++--
 clang/lib/Sema/TreeTransform.h                |  84 ++++++-----
 clang/test/SemaCXX/ctad.cpp                   | 132 +++++++++++++++++-
 clang/test/SemaTemplate/deduction-guide.cpp   |  48 +++++++
 6 files changed, 283 insertions(+), 57 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 03b68271b7864..955325026f369 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1060,6 +1060,8 @@ Bug Fixes to C++ Support
 - Clang is now better at keeping track of friend function template instance contexts. (#GH55509)
 - Fixed an integer overflow bug in computing template parameter depths when synthesizing CTAD guides. (#GH128691)
 - Fixed an incorrect pointer access when checking access-control on concepts. (#GH131530)
+- Fixed various alias CTAD bugs involving variadic template arguments. (#GH123591), (#GH127539), (#GH129077),
+  (#GH129620), and (#GH129998).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index b424de9c8a945..6728857edc6d8 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -377,12 +377,10 @@ struct ConvertConstructorToDeductionGuideTransform {
         if (NestedPattern)
           Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth());
         auto [Depth, Index] = getDepthAndIndex(Param);
-        // Depth can still be 0 if FTD belongs to an explicit class template
-        // specialization with an empty template parameter list. In that case,
-        // we don't want the NewDepth to overflow, and it should remain 0.
-        assert(Depth ||
-               cast<ClassTemplateSpecializationDecl>(FTD->getDeclContext())
-                   ->isExplicitSpecialization());
+        // Depth can be 0 if FTD belongs to a non-template class/a class
+        // template specialization with an empty template parameter list. In
+        // that case, we don't want the NewDepth to overflow, and it should
+        // remain 0.
         NamedDecl *NewParam = transformTemplateParameter(
             SemaRef, DC, Param, Args, Index + Depth1IndexAdjustment,
             Depth ? Depth - 1 : 0);
@@ -989,6 +987,19 @@ getRHSTemplateDeclAndArgs(Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate) {
   return {Template, AliasRhsTemplateArgs};
 }
 
+bool IsNonDeducedArgument(const TemplateArgument &TA) {
+  // The following cases indicate the template argument is non-deducible:
+  //   1. The result is null. E.g. When it comes from a default template
+  //   argument that doesn't appear in the alias declaration.
+  //   2. The template parameter is a pack and that cannot be deduced from
+  //   the arguments within the alias declaration.
+  // Non-deducible template parameters will persist in the transformed
+  // deduction guide.
+  return TA.isNull() ||
+         (TA.getKind() == TemplateArgument::Pack &&
+          llvm::any_of(TA.pack_elements(), IsNonDeducedArgument));
+}
+
 // Build deduction guides for a type alias template from the given underlying
 // deduction guide F.
 FunctionTemplateDecl *
@@ -1057,7 +1068,8 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
   // !!NOTE: DeduceResults respects the sequence of template parameters of
   // the deduction guide f.
   for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
-    if (const auto &D = DeduceResults[Index]; !D.isNull()) // Deduced
+    const auto &D = DeduceResults[Index];
+    if (!IsNonDeducedArgument(D))
       DeducedArgs.push_back(D);
     else
       NonDeducedTemplateParamsInFIndex.push_back(Index);
@@ -1121,7 +1133,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
   Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
   for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
     const auto &D = DeduceResults[Index];
-    if (D.isNull()) {
+    if (IsNonDeducedArgument(D)) {
       // 2): Non-deduced template parameters would be substituted later.
       continue;
     }
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index cf29d8a101b43..73567f3be814d 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1347,6 +1347,16 @@ std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
   return std::nullopt;
 }
 
+static TemplateArgument
+getPackSubstitutedTemplateArgument(Sema &S, TemplateArgument Arg) {
+  assert(S.ArgumentPackSubstitutionIndex >= 0);
+  assert(S.ArgumentPackSubstitutionIndex < (int)Arg.pack_size());
+  Arg = Arg.pack_begin()[S.ArgumentPackSubstitutionIndex];
+  if (Arg.isPackExpansion())
+    Arg = Arg.getPackExpansionPattern();
+  return Arg;
+}
+
 //===----------------------------------------------------------------------===/
 // Template Instantiation for Types
 //===----------------------------------------------------------------------===/
@@ -1466,11 +1476,13 @@ namespace {
       }
     }
 
-    static TemplateArgument
+    TemplateArgument
     getTemplateArgumentPackPatternForRewrite(const TemplateArgument &TA) {
       if (TA.getKind() != TemplateArgument::Pack)
         return TA;
-      assert(TA.pack_size() == 1 &&
+      if (SemaRef.ArgumentPackSubstitutionIndex != -1)
+        return getPackSubstitutedTemplateArgument(SemaRef, TA);
+      assert(TA.pack_size() == 1 && TA.pack_begin()->isPackExpansion() &&
              "unexpected pack arguments in template rewrite");
       TemplateArgument Arg = *TA.pack_begin();
       if (Arg.isPackExpansion())
@@ -1629,6 +1641,9 @@ namespace {
       std::vector<TemplateArgument> TArgs;
       switch (Arg.getKind()) {
       case TemplateArgument::Pack:
+        assert(SemaRef.CodeSynthesisContexts.empty() ||
+               SemaRef.CodeSynthesisContexts.back().Kind ==
+                   Sema::CodeSynthesisContext::BuildingDeductionGuides);
         // Literally rewrite the template argument pack, instead of unpacking
         // it.
         for (auto &pack : Arg.getPackAsArray()) {
@@ -1649,6 +1664,23 @@ namespace {
       return inherited::TransformTemplateArgument(Input, Output, Uneval);
     }
 
+    std::optional<unsigned> ComputeSizeOfPackExprWithoutSubstitution(
+        ArrayRef<TemplateArgument> PackArgs) {
+      // Don't do this when rewriting template parameters for CTAD:
+      //   1) The heuristic needs the unpacked Subst* nodes to figure out the
+      //   expanded size, but this never applies since Subst* nodes are not
+      //   created in rewrite scenarios.
+      //
+      //   2) The heuristic substitutes into the pattern with pack expansion
+      //   suppressed, which does not meet the requirements for argument
+      //   rewriting when template arguments include a non-pack matching against
+      //   a pack, particularly when rewriting an alias CTAD.
+      if (TemplateArgs.isRewrite())
+        return std::nullopt;
+
+      return inherited::ComputeSizeOfPackExprWithoutSubstitution(PackArgs);
+    }
+
     template<typename Fn>
     QualType TransformFunctionProtoType(TypeLocBuilder &TLB,
                                         FunctionProtoTypeLoc TL,
@@ -1867,16 +1899,6 @@ bool TemplateInstantiator::AlreadyTransformed(QualType T) {
   return true;
 }
 
-static TemplateArgument
-getPackSubstitutedTemplateArgument(Sema &S, TemplateArgument Arg) {
-  assert(S.ArgumentPackSubstitutionIndex >= 0);
-  assert(S.ArgumentPackSubstitutionIndex < (int)Arg.pack_size());
-  Arg = Arg.pack_begin()[S.ArgumentPackSubstitutionIndex];
-  if (Arg.isPackExpansion())
-    Arg = Arg.getPackExpansionPattern();
-  return Arg;
-}
-
 Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) {
   if (!D)
     return nullptr;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 2a5e354ff716a..3e8f0ec485e9b 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -3660,6 +3660,9 @@ class TreeTransform {
     return SemaRef.BuildCXXNoexceptExpr(Range.getBegin(), Arg, Range.getEnd());
   }
 
+  std::optional<unsigned>
+  ComputeSizeOfPackExprWithoutSubstitution(ArrayRef<TemplateArgument> PackArgs);
+
   /// Build a new expression to compute the length of a parameter pack.
   ExprResult RebuildSizeOfPackExpr(SourceLocation OperatorLoc, NamedDecl *Pack,
                                    SourceLocation PackLoc,
@@ -15877,6 +15880,49 @@ TreeTransform<Derived>::TransformPackExpansionExpr(PackExpansionExpr *E) {
                                            E->getNumExpansions());
 }
 
+template <typename Derived>
+std::optional<unsigned>
+TreeTransform<Derived>::ComputeSizeOfPackExprWithoutSubstitution(
+    ArrayRef<TemplateArgument> PackArgs) {
+  std::optional<unsigned> Result = 0;
+  for (const TemplateArgument &Arg : PackArgs) {
+    if (!Arg.isPackExpansion()) {
+      Result = *Result + 1;
+      continue;
+    }
+
+    TemplateArgumentLoc ArgLoc;
+    InventTemplateArgumentLoc(Arg, ArgLoc);
+
+    // Find the pattern of the pack expansion.
+    SourceLocation Ellipsis;
+    std::optional<unsigned> OrigNumExpansions;
+    TemplateArgumentLoc Pattern =
+        getSema().getTemplateArgumentPackExpansionPattern(ArgLoc, Ellipsis,
+                                                          OrigNumExpansions);
+
+    // Substitute under the pack expansion. Do not expand the pack (yet).
+    TemplateArgumentLoc OutPattern;
+    Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1);
+    if (getDerived().TransformTemplateArgument(Pattern, OutPattern,
+                                               /*Uneval*/ true))
+      return true;
+
+    // See if we can determine the number of arguments from the result.
+    std::optional<unsigned> NumExpansions =
+        getSema().getFullyPackExpandedSize(OutPattern.getArgument());
+    if (!NumExpansions) {
+      // No: we must be in an alias template expansion, and we're going to
+      // need to actually expand the packs.
+      Result = std::nullopt;
+      break;
+    }
+
+    Result = *Result + *NumExpansions;
+  }
+  return Result;
+}
+
 template<typename Derived>
 ExprResult
 TreeTransform<Derived>::TransformSizeOfPackExpr(SizeOfPackExpr *E) {
@@ -15942,42 +15988,8 @@ TreeTransform<Derived>::TransformSizeOfPackExpr(SizeOfPackExpr *E) {
   }
 
   // Try to compute the result without performing a partial substitution.
-  std::optional<unsigned> Result = 0;
-  for (const TemplateArgument &Arg : PackArgs) {
-    if (!Arg.isPackExpansion()) {
-      Result = *Result + 1;
-      continue;
-    }
-
-    TemplateArgumentLoc ArgLoc;
-    InventTemplateArgumentLoc(Arg, ArgLoc);
-
-    // Find the pattern of the pack expansion.
-    SourceLocation Ellipsis;
-    std::optional<unsigned> OrigNumExpansions;
-    TemplateArgumentLoc Pattern =
-        getSema().getTemplateArgumentPackExpansionPattern(ArgLoc, Ellipsis,
-                                                          OrigNumExpansions);
-
-    // Substitute under the pack expansion. Do not expand the pack (yet).
-    TemplateArgumentLoc OutPattern;
-    Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1);
-    if (getDerived().TransformTemplateArgument(Pattern, OutPattern,
-                                               /*Uneval*/ true))
-      return true;
-
-    // See if we can determine the number of arguments from the result.
-    std::optional<unsigned> NumExpansions =
-        getSema().getFullyPackExpandedSize(OutPattern.getArgument());
-    if (!NumExpansions) {
-      // No: we must be in an alias template expansion, and we're going to need
-      // to actually expand the packs.
-      Result = std::nullopt;
-      break;
-    }
-
-    Result = *Result + *NumExpansions;
-  }
+  std::optional<unsigned> Result =
+      getDerived().ComputeSizeOfPackExprWithoutSubstitution(PackArgs);
 
   // Common case: we could determine the number of expansions without
   // substituting.
diff --git a/clang/test/SemaCXX/ctad.cpp b/clang/test/SemaCXX/ctad.cpp
index 10806f107b4ee..00a861d0f567c 100644
--- a/clang/test/SemaCXX/ctad.cpp
+++ b/clang/test/SemaCXX/ctad.cpp
@@ -1,5 +1,4 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wno-unused-value -std=c++20 %s
-// expected-no-diagnostics
 
 namespace GH64347 {
 
@@ -17,3 +16,134 @@ void k() {
 }
 
 } // namespace GH64347
+
+namespace GH123591 {
+
+
+template < typename... _Types >
+struct variant {
+  template <int N = sizeof...(_Types)>
+  variant(_Types...);
+};
+
+template <class T>
+using AstNode = variant<T, T, T>;
+
+AstNode tree(42, 43, 44);
+
+}
+
+namespace GH123591_2 {
+
+template <int>
+using enable_if_t = char;
+
+template < typename... Types >
+struct variant {
+  template < enable_if_t<sizeof...(Types)>>
+  variant();
+};
+
+template <int>
+using AstNode = variant<>;
+// expected-note@-1 {{couldn't infer template argument ''}} \
+// expected-note@-1 2{{implicit deduction guide declared as}} \
+// expected-note@-1 {{candidate function template not viable}}
+
+
+AstNode tree; // expected-error {{no viable constructor or deduction guide}}
+
+}
+
+namespace GH127539 {
+
+template <class...>
+struct A {
+    template <class... ArgTs>
+    A(ArgTs...) {}
+};
+
+template <class... ArgTs>
+A(ArgTs...) -> A<typename ArgTs::value_type...>;
+
+template <class... Ts>
+using AA = A<Ts..., Ts...>;
+
+AA a{};
+
+}
+
+namespace GH129077 {
+
+using size_t = decltype(sizeof(0));
+
+struct index_type
+{
+  size_t value = 0;
+  index_type() = default;
+  constexpr index_type(size_t i) noexcept : value(i) {}
+};
+
+template <index_type... Extents>
+struct extents
+{
+  constexpr extents(decltype(Extents)...) noexcept {}
+};
+
+template <class... Extents>
+extents(Extents...) -> extents<(requires { Extents::value; } ? Extents{} : ~0ull)...>;
+
+template <index_type... Index>
+using index = extents<Index...>;
+
+int main()
+{
+  extents i{0,0};
+  auto j = extents<64,{}>({}, 42);
+
+  index k{0,0};
+  auto l = index<64,{}>({}, 42);
+
+  return 0;
+}
+
+}
+
+namespace GH129620 {
+
+template <class... Ts>
+struct A {
+    constexpr A(Ts...) {}
+};
+
+template <class... Ts>
+using Foo = A<Ts...>;
+
+template <class T>
+using Bar = Foo<T, T>;
+
+Bar a{0, 0};
+
+}
+
+namespace GH129998 {
+
+struct converible_to_one {
+    constexpr operator int() const noexcept { return 1; }
+};
+
+template <int... Extents>
+struct class_template {
+    class_template() = default;
+    constexpr class_template(auto&&...) noexcept {}
+};
+
+template <class... Extents>
+class_template(Extents...) -> class_template<(true ? 0 : +Extents{})...>;
+
+template <int... Extents>
+using alias_template = class_template<Extents...>;
+
+alias_template var2{converible_to_one{}, 2};
+
+}
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index ecd152abebd74..6db132ca37c7e 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -723,3 +723,51 @@ void test() { NewDeleteAllocator abc(42); } // expected-error {{no viable constr
 // CHECK-NEXT:  `-ParmVarDecl {{.+}} 'T'
 
 } // namespace GH128691
+
+namespace GH132616_DeductionGuide {
+
+template <class T> struct A {
+  template <class U>
+  A(U);
+};
+
+template <typename>
+struct B : A<int> {
+  using A::A;
+};
+
+template <class T>
+B(T) -> B<T>;
+
+B b(24);
+
+// CHECK-LABEL: Dumping GH132616_DeductionGuide::<deduction guide for B>:
+// CHECK-NEXT: FunctionTemplateDecl {{.+}} implicit <deduction guide for B>
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} typename depth 0 index 0
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} class depth 0 index 1 U
+// CHECK-NEXT: `-CXXDeductionGuideDecl {{.+}} implicit <deduction guide for B> 'auto (U) -> B<type-parameter-0-0>'
+// CHECK-NEXT:  `-ParmVarDecl {{.+}} 'U'
+
+struct C {
+  template <class U>
+  C(U);
+};
+
+template <typename>
+struct D : C {
+  using C::C;
+};
+
+template <class T>
+D(T) -> D<T>;
+
+D d(24);
+
+// CHECK-LABEL: Dumping GH132616_DeductionGuide::<deduction guide for D>:
+// CHECK-NEXT: FunctionTemplateDecl {{.+}} implicit <deduction guide for D>
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} typename depth 0 index 0
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} class depth 0 index 1 U
+// CHECK-NEXT: `-CXXDeductionGuideDecl {{.+}} implicit <deduction guide for D> 'auto (U) -> D<type-parameter-0-0>'
+// CHECK-NEXT:  `-ParmVarDecl {{.+}} 'U'
+
+} // namespace GH132616_DeductionGuide

From ecde8c235e5e09ff71789725c96416f8daf93cd7 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Sat, 8 Mar 2025 20:32:14 -0300
Subject: [PATCH 271/282] [clang] fix matching of nested template template
 parameters

When checking the template template parameters of template template
parameters, the PartialOrdering context was not correctly propagated.

This also has a few drive-by fixes, such as checking the template parameter
lists of template template parameters, which was previously missing and
would have been it's own bug, but we need to fix it in order to
prevent crashes in error recovery in a simple way.

Fixes #130362

Backport of: https://github.com/llvm/llvm-project/pull/130447
---
 clang/docs/ReleaseNotes.rst                   |  3 ++
 clang/include/clang/Sema/Sema.h               |  8 +++--
 clang/lib/Sema/SemaDecl.cpp                   |  2 +-
 clang/lib/Sema/SemaDeclCXX.cpp                |  2 +-
 clang/lib/Sema/SemaTemplate.cpp               | 36 ++++++++++++-------
 clang/lib/Sema/SemaTemplateDeduction.cpp      | 16 +++++----
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  2 +-
 clang/test/SemaTemplate/cwg2398.cpp           | 22 ++++++------
 .../SemaTemplate/temp_arg_template_p0522.cpp  |  3 +-
 clang/unittests/AST/DeclPrinterTest.cpp       | 16 ++++-----
 10 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 955325026f369..c921ac3518f01 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1058,6 +1058,9 @@ Bug Fixes to C++ Support
 - Fixed a substitution bug in transforming CTAD aliases when the type alias contains a non-pack template argument
   corresponding to a pack parameter (#GH124715)
 - Clang is now better at keeping track of friend function template instance contexts. (#GH55509)
+- Fixes matching of nested template template parameters. (#GH130362)
+- Correctly diagnoses template template paramters which have a pack parameter
+  not in the last position.
 - Fixed an integer overflow bug in computing template parameter depths when synthesizing CTAD guides. (#GH128691)
 - Fixed an incorrect pointer access when checking access-control on concepts. (#GH131530)
 - Fixed various alias CTAD bugs involving variadic template arguments. (#GH123591), (#GH127539), (#GH129077),
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index cecf5cff332f4..d8cc0171c22c6 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11279,14 +11279,16 @@ class Sema final : public SemaBase {
 
   /// The context in which we are checking a template parameter list.
   enum TemplateParamListContext {
-    TPC_ClassTemplate,
-    TPC_VarTemplate,
+    // For this context, Class, Variable, TypeAlias, and non-pack Template
+    // Template Parameters are treated uniformly.
+    TPC_Other,
+
     TPC_FunctionTemplate,
     TPC_ClassTemplateMember,
     TPC_FriendClassTemplate,
     TPC_FriendFunctionTemplate,
     TPC_FriendFunctionTemplateDefinition,
-    TPC_TypeAliasTemplate
+    TPC_TemplateTemplateParameterPack,
   };
 
   /// Checks the validity of a template parameter list, possibly
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index f70401ea33b4a..41d5f9f2f3420 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8145,7 +8145,7 @@ NamedDecl *Sema::ActOnVariableDeclarator(
               (D.getCXXScopeSpec().isSet() && DC && DC->isRecord() &&
                DC->isDependentContext())
                   ? TPC_ClassTemplateMember
-                  : TPC_VarTemplate))
+                  : TPC_Other))
         NewVD->setInvalidDecl();
 
       // If we are providing an explicit specialization of a static variable
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index e4e3bbad1f520..85de46c9adab4 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -13533,7 +13533,7 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS,
     // Merge any previous default template arguments into our parameters,
     // and check the parameter list.
     if (CheckTemplateParameterList(TemplateParams, OldTemplateParams,
-                                   TPC_TypeAliasTemplate))
+                                   TPC_Other))
       return nullptr;
 
     TypeAliasTemplateDecl *NewDecl =
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 938671055333c..1c555b38277b0 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1591,8 +1591,16 @@ NamedDecl *Sema::ActOnTemplateTemplateParameter(
   assert(S->isTemplateParamScope() &&
          "Template template parameter not in template parameter scope!");
 
-  // Construct the parameter object.
   bool IsParameterPack = EllipsisLoc.isValid();
+
+  bool Invalid = false;
+  if (CheckTemplateParameterList(
+          Params,
+          /*OldParams=*/nullptr,
+          IsParameterPack ? TPC_TemplateTemplateParameterPack : TPC_Other))
+    Invalid = true;
+
+  // Construct the parameter object.
   TemplateTemplateParmDecl *Param = TemplateTemplateParmDecl::Create(
       Context, Context.getTranslationUnitDecl(),
       NameLoc.isInvalid() ? TmpLoc : NameLoc, Depth, Position, IsParameterPack,
@@ -1615,9 +1623,12 @@ NamedDecl *Sema::ActOnTemplateTemplateParameter(
   if (Params->size() == 0) {
     Diag(Param->getLocation(), diag::err_template_template_parm_no_parms)
     << SourceRange(Params->getLAngleLoc(), Params->getRAngleLoc());
-    Param->setInvalidDecl();
+    Invalid = true;
   }
 
+  if (Invalid)
+    Param->setInvalidDecl();
+
   // C++0x [temp.param]p9:
   //   A default template-argument may be specified for any kind of
   //   template-parameter that is not a template parameter pack.
@@ -2066,7 +2077,7 @@ DeclResult Sema::CheckClassTemplate(
            SemanticContext->isDependentContext())
               ? TPC_ClassTemplateMember
           : TUK == TagUseKind::Friend ? TPC_FriendClassTemplate
-                                      : TPC_ClassTemplate,
+                                      : TPC_Other,
           SkipBody))
     Invalid = true;
 
@@ -2208,9 +2219,8 @@ static bool DiagnoseDefaultTemplateArgument(Sema &S,
                                             SourceLocation ParamLoc,
                                             SourceRange DefArgRange) {
   switch (TPC) {
-  case Sema::TPC_ClassTemplate:
-  case Sema::TPC_VarTemplate:
-  case Sema::TPC_TypeAliasTemplate:
+  case Sema::TPC_Other:
+  case Sema::TPC_TemplateTemplateParameterPack:
     return false;
 
   case Sema::TPC_FunctionTemplate:
@@ -2383,8 +2393,11 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
         MissingDefaultArg = true;
     } else if (NonTypeTemplateParmDecl *NewNonTypeParm
                = dyn_cast<NonTypeTemplateParmDecl>(*NewParam)) {
-      // Check for unexpanded parameter packs.
-      if (!NewNonTypeParm->isParameterPack() &&
+      // Check for unexpanded parameter packs, except in a template template
+      // parameter pack, as in those any unexpanded packs should be expanded
+      // along with the parameter itself.
+      if (TPC != TPC_TemplateTemplateParameterPack &&
+          !NewNonTypeParm->isParameterPack() &&
           DiagnoseUnexpandedParameterPack(NewNonTypeParm->getLocation(),
                                           NewNonTypeParm->getTypeSourceInfo(),
                                           UPPC_NonTypeTemplateParameterType)) {
@@ -2492,8 +2505,7 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
     //   If a template parameter of a primary class template or alias template
     //   is a template parameter pack, it shall be the last template parameter.
     if (SawParameterPack && (NewParam + 1) != NewParamEnd &&
-        (TPC == TPC_ClassTemplate || TPC == TPC_VarTemplate ||
-         TPC == TPC_TypeAliasTemplate)) {
+        (TPC == TPC_Other || TPC == TPC_TemplateTemplateParameterPack)) {
       Diag((*NewParam)->getLocation(),
            diag::err_template_param_pack_must_be_last_template_parameter);
       Invalid = true;
@@ -2526,8 +2538,8 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
           << PrevModuleName;
       Invalid = true;
     } else if (MissingDefaultArg &&
-               (TPC == TPC_ClassTemplate || TPC == TPC_FriendClassTemplate ||
-                TPC == TPC_VarTemplate || TPC == TPC_TypeAliasTemplate)) {
+               (TPC == TPC_Other || TPC == TPC_TemplateTemplateParameterPack ||
+                TPC == TPC_FriendClassTemplate)) {
       // C++ 23[temp.param]p14:
       // If a template-parameter of a class template, variable template, or
       // alias template has a default template argument, each subsequent
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 5304b5a2155b4..7a880505a53ff 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3427,9 +3427,9 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
       if (!P.isPackExpansion() && !A.isPackExpansion()) {
         Info.Param =
             makeTemplateParameter(Template->getTemplateParameters()->getParam(
-                (PsStack.empty() ? TemplateArgs.end()
-                                 : PsStack.front().begin()) -
-                TemplateArgs.begin()));
+                (AsStack.empty() ? CTAI.CanonicalConverted.end()
+                                 : AsStack.front().begin()) -
+                1 - CTAI.CanonicalConverted.begin()));
         Info.FirstArg = P;
         Info.SecondArg = A;
         return TemplateDeductionResult::NonDeducedMismatch;
@@ -6625,17 +6625,19 @@ bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs(
 
   TemplateDeductionResult TDK;
   runWithSufficientStackSpace(Info.getLocation(), [&] {
-    TDK = ::FinishTemplateArgumentDeduction(
-        *this, AArg, /*IsPartialOrdering=*/true, PArgs, Deduced, Info);
+    TDK = ::FinishTemplateArgumentDeduction(*this, AArg, PartialOrdering, PArgs,
+                                            Deduced, Info);
   });
   switch (TDK) {
   case TemplateDeductionResult::Success:
     return true;
 
   // It doesn't seem possible to get a non-deduced mismatch when partial
-  // ordering TTPs.
+  // ordering TTPs, except with an invalid template parameter list which has
+  // a parameter after a pack.
   case TemplateDeductionResult::NonDeducedMismatch:
-    llvm_unreachable("Unexpected NonDeducedMismatch");
+    assert(PArg->isInvalidDecl() && "Unexpected NonDeducedMismatch");
+    return false;
 
   // Substitution failures should have already been diagnosed.
   case TemplateDeductionResult::AlreadyDiagnosed:
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 89ad2a0a9b7bb..0c25b87439a95 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1827,7 +1827,7 @@ Decl *TemplateDeclInstantiator::VisitClassTemplateDecl(ClassTemplateDecl *D) {
       // Do some additional validation, then merge default arguments
       // from the existing declarations.
       if (SemaRef.CheckTemplateParameterList(InstParams, PrevParams,
-                                             Sema::TPC_ClassTemplate))
+                                             Sema::TPC_Other))
         return nullptr;
 
       Inst->setAccess(PrevClassTemplate->getAccess());
diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp
index 8592be469bb50..33b288acce82a 100644
--- a/clang/test/SemaTemplate/cwg2398.cpp
+++ b/clang/test/SemaTemplate/cwg2398.cpp
@@ -650,6 +650,11 @@ namespace regression3 {
   template struct A<B, Node<None>>;
   // old-error@-1 {{different template}}
 } // namespace regression3
+namespace GH130362 {
+  template <template <template <class... T1> class TT1> class TT2> struct A {};
+  template <template <class U1> class UU1> struct B {};
+  template struct A<B>;
+} // namespace GH130362
 
 namespace nttp_auto {
   namespace t1 {
@@ -658,26 +663,19 @@ namespace nttp_auto {
     template struct A<B>;
   } // namespace t1
   namespace t2 {
-    // FIXME: Shouldn't accept parameters after a parameter pack.
     template<template<auto... Va1, auto Va2> class> struct A {};
-    // new-error@-1 {{deduced non-type template argument does not have the same type as the corresponding template parameter ('auto' vs 'int')}}
-    // expected-note@-2 {{previous template template parameter is here}}
+    // expected-error@-1 {{template parameter pack must be the last template parameter}}
+    // old-note@-2 {{previous template template parameter is here}}
     template<int... Vi> struct B;
-    // new-note@-1 {{template parameter is declared here}}
-    // old-note@-2 {{too few template parameters}}
+    // old-note@-1 {{too few template parameters}}
     template struct A<B>;
-    // new-note@-1 {{different template parameters}}
-    // old-error@-2 {{different template parameters}}
+    // old-error@-1 {{different template parameters}}
   } // namespace t2
   namespace t3 {
-    // FIXME: Shouldn't accept parameters after a parameter pack.
     template<template<auto... Va1, auto... Va2> class> struct A {};
-    // new-error@-1 {{deduced non-type template argument does not have the same type as the corresponding template parameter ('auto' vs 'int')}}
-    // new-note@-2 {{previous template template parameter is here}}
+    // expected-error@-1 {{template parameter pack must be the last template parameter}}
     template<int... Vi> struct B;
-    // new-note@-1 {{template parameter is declared here}}
     template struct A<B>;
-    // new-note@-1 {{different template parameters}}
   } // namespace t3
 } // namespace nttp_auto
 
diff --git a/clang/test/SemaTemplate/temp_arg_template_p0522.cpp b/clang/test/SemaTemplate/temp_arg_template_p0522.cpp
index 2e5a36ae6ed08..d8a81bb363112 100644
--- a/clang/test/SemaTemplate/temp_arg_template_p0522.cpp
+++ b/clang/test/SemaTemplate/temp_arg_template_p0522.cpp
@@ -7,7 +7,8 @@
 template<template<int> typename> struct Ti; // #Ti
 template<template<int...> typename> struct TPi; // #TPi
 template<template<int, int...> typename> struct TiPi;
-template<template<int..., int...> typename> struct TPiPi; // FIXME: Why is this not ill-formed?
+template<template<int..., int...> typename> struct TPiPi;
+// expected-error@-1 {{template parameter pack must be the last template parameter}}
 
 template<typename T, template<T> typename> struct tT0; // #tT0
 template<template<typename T, T> typename> struct Tt0; // #Tt0
diff --git a/clang/unittests/AST/DeclPrinterTest.cpp b/clang/unittests/AST/DeclPrinterTest.cpp
index 6945dff537cae..124b1a166cb18 100644
--- a/clang/unittests/AST/DeclPrinterTest.cpp
+++ b/clang/unittests/AST/DeclPrinterTest.cpp
@@ -1196,21 +1196,21 @@ TEST(DeclPrinter, TestUnnamedTemplateParameters) {
 }
 
 TEST(DeclPrinter, TestUnnamedTemplateParametersPacks) {
-  ASSERT_TRUE(PrintedDeclCXX17Matches(
-      "template <typename ..., int ...,"
-      " template <typename ..., bool ...> class ...> void A();",
-      functionTemplateDecl(hasName("A")).bind("id"),
-      "template <typename ..., int ...,"
-      " template <typename ..., bool ...> class ...> void A()"));
+  ASSERT_TRUE(
+      PrintedDeclCXX17Matches("template <typename ..., int ...,"
+                              " template <typename ...> class ...> void A();",
+                              functionTemplateDecl(hasName("A")).bind("id"),
+                              "template <typename ..., int ...,"
+                              " template <typename ...> class ...> void A()"));
 }
 
 TEST(DeclPrinter, TestNamedTemplateParametersPacks) {
   ASSERT_TRUE(PrintedDeclCXX17Matches(
       "template <typename ...T, int ...I,"
-      " template <typename ...X, bool ...B> class ...Z> void A();",
+      " template <typename ...X> class ...Z> void A();",
       functionTemplateDecl(hasName("A")).bind("id"),
       "template <typename ...T, int ...I,"
-      " template <typename ...X, bool ...B> class ...Z> void A()"));
+      " template <typename ...X> class ...Z> void A()"));
 }
 
 TEST(DeclPrinter, TestTemplateTemplateParameterWrittenWithTypename) {

From 1a76c29a9ba877c85acf455f05af47039607958b Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Mon, 10 Mar 2025 16:35:22 -0600
Subject: [PATCH 272/282] [hexagon] Enable --eh-frame-hdr (#130225)

The missing `PT_GNU_EH_FRAME` was causing C++ exception handling test
failures in llvm-test-suite. We should unconditionally add this argument
like the other drivers do.

Discovered-by: Alexey Karyakin <akaryaki@quicinc.com>
Fixes: #129745
(cherry picked from commit 6657769199ad625ea0cb7f7c054d4f6a27806080)
---
 clang/lib/Driver/ToolChains/Hexagon.cpp     | 1 +
 clang/test/Driver/hexagon-toolchain-elf.c   | 1 +
 clang/test/Driver/hexagon-toolchain-linux.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
index 76cedf312d68a..772a9827cb211 100644
--- a/clang/lib/Driver/ToolChains/Hexagon.cpp
+++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
@@ -313,6 +313,7 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA,
                                      // handled somewhere else.
   Args.ClaimAllArgs(options::OPT_static_libgcc);
 
+  CmdArgs.push_back("--eh-frame-hdr");
   //----------------------------------------------------------------------------
   //
   //----------------------------------------------------------------------------
diff --git a/clang/test/Driver/hexagon-toolchain-elf.c b/clang/test/Driver/hexagon-toolchain-elf.c
index be812dda40d57..de2ebfeeda26c 100644
--- a/clang/test/Driver/hexagon-toolchain-elf.c
+++ b/clang/test/Driver/hexagon-toolchain-elf.c
@@ -555,6 +555,7 @@
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
 // RUN:   -mcpu=hexagonv60 \
 // RUN:   -fuse-ld=lld %s 2>&1 | FileCheck -check-prefix=CHECK382 %s
+// CHECK382:          "--eh-frame-hdr
 // CHECK382-NOT:      "-march=
 // CHECK382-NOT:      "-mcpu=
 // -----------------------------------------------------------------------------
diff --git a/clang/test/Driver/hexagon-toolchain-linux.c b/clang/test/Driver/hexagon-toolchain-linux.c
index 6f7f3b20f9141..e791353cca07f 100644
--- a/clang/test/Driver/hexagon-toolchain-linux.c
+++ b/clang/test/Driver/hexagon-toolchain-linux.c
@@ -127,6 +127,7 @@
 // RUN:    --target=hexagon-unknown-linux-musl %s -### 2>&1 \
 // RUN:    | FileCheck -check-prefix=CHECK011 %s
 // CHECK011:   InstalledDir: [[INSTALLED_DIR:.+]]
+// CHECK011:   "--eh-frame-hdr"
 // CHECK011:   crt1.o
 // CHECK011-NOT:  "-lunwind"
 // CHECK011-NOT:  "-lgcc_eh"

From d60baf3d47863083079e840efc8912d37399076f Mon Sep 17 00:00:00 2001
From: aankit-ca <quic_aankit@quicinc.com>
Date: Thu, 13 Mar 2025 12:48:31 -0700
Subject: [PATCH 273/282] [HEXAGON] Fix semantics of ordered FP compares
 (#131089)

For the ordered FP compare bitcode instructions, the Hexagon backend was
assuming that no operand could be a NaN. This assumption is flawed. This
patch fixes the code-generation to produce fpcmp.uo and and appropriate
bit comparison operators to account for the case when an operand to a FP
compare is a NaN.

Fix for https://github.com/llvm/llvm-project/issues/129391

Co-authored-by: aankit-quic <aankit@quicinc.com>
(cherry picked from commit d642eec78fc94ef3c5266dc0b10b8c51ea046e7a)
---
 llvm/lib/Target/Hexagon/HexagonPatterns.td |  38 +++--
 llvm/test/CodeGen/Hexagon/fcmp-nan.ll      | 189 +++++++++++++++++++++
 2 files changed, 213 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/fcmp-nan.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index cba5ff1ab0d9b..244f204539c89 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -721,11 +721,6 @@ def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         v2i1, V2I32>;
 def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
 def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
 def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
-def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
-def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
-def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
-def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
-def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
 def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
 def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
 def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
@@ -733,11 +728,6 @@ def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
 def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
 def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
 def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
-def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
-def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
-def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
-def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
-def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
 def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
 def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
 def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
@@ -900,15 +890,35 @@ def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
 def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
 def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
 
-def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
-def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
+class T4<InstHexagon MI1, InstHexagon MI2, InstHexagon MI3, InstHexagon MI4>
+  : OutPatFrag<(ops node:$Rs, node:$Rt),
+               (MI1 (MI2 (MI3 $Rs, $Rt), (MI4 $Rs, $Rt)))>;
 
-def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
-def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
+class Cmpof<InstHexagon MI>: T3<C2_andn, MI,  F2_sfcmpuo>;
+class Cmpod<InstHexagon MI>: T3<C2_andn, MI,  F2_dfcmpuo>;
+
+class Cmpofn<InstHexagon MI>: T4<C2_not,  C2_or, MI,  F2_sfcmpuo>;
+class Cmpodn<InstHexagon MI>: T4<C2_not,  C2_or, MI,  F2_dfcmpuo>;
+
+def: OpmR_RR_pat<Cmpof<F2_sfcmpeq>,  setoeq,         i1, F32>;
+def: OpmR_RR_pat<Cmpof<F2_sfcmpge>,  setoge,         i1, F32>;
+def: OpmR_RR_pat<Cmpof<F2_sfcmpgt>,  setogt,         i1, F32>;
+def: OpmR_RR_pat<Cmpof<F2_sfcmpge>,  RevCmp<setole>, i1, F32>;
+def: OpmR_RR_pat<Cmpof<F2_sfcmpgt>,  RevCmp<setolt>, i1, F32>;
+def: OpmR_RR_pat<Cmpofn<F2_sfcmpeq>, setone,         i1, F32>;
+
+def: OpmR_RR_pat<Cmpod<F2_dfcmpeq>,  setoeq,         i1, F64>;
+def: OpmR_RR_pat<Cmpod<F2_dfcmpge>,  setoge,         i1, F64>;
+def: OpmR_RR_pat<Cmpod<F2_dfcmpgt>,  setogt,         i1, F64>;
+def: OpmR_RR_pat<Cmpod<F2_dfcmpge>,  RevCmp<setole>, i1, F64>;
+def: OpmR_RR_pat<Cmpod<F2_dfcmpgt>,  RevCmp<setolt>, i1, F64>;
+def: OpmR_RR_pat<Cmpodn<F2_dfcmpeq>, setone,         i1, F64>;
 
 def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
 def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
 
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
 
 // --(6) Select ----------------------------------------------------------
 //
diff --git a/llvm/test/CodeGen/Hexagon/fcmp-nan.ll b/llvm/test/CodeGen/Hexagon/fcmp-nan.ll
new file mode 100644
index 0000000000000..1469402911601
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/fcmp-nan.ll
@@ -0,0 +1,189 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Test that all FP ordered compare instructions generate the correct
+; post-processing to accommodate NaNs.
+;
+; Specifically for ordered FP compares, we have to check if one of
+; the operands was a NaN to comform to the semantics of the ordered
+; fcmp bitcode instruction
+;
+target triple = "hexagon"
+
+;
+; Functions for float:
+;
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = sfcmp.eq(r0,r1)
+; CHECK-DAG: [[REG1:p([0-3])]] = sfcmp.uo(r0,r1)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_oeq_f(float %val, float %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp oeq float %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = sfcmp.eq(r0,r1)
+; CHECK-DAG: [[REG1:p([0-3])]] = sfcmp.uo(r0,r1)
+; CHECK: [[REG2:p([0-3])]] = or([[REG0]],[[REG1]])
+; CHECK: r0 = mux([[REG2]],#0,#1)
+;
+define i32 @compare_one_f(float %val, float %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp one float %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = sfcmp.gt(r0,r1)
+; CHECK-DAG: [[REG1:p([0-3])]] = sfcmp.uo(r0,r1)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_ogt_f(float %val, float %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp ogt float %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = sfcmp.ge(r1,r0)
+; CHECK-DAG: [[REG1:p([0-3])]] = sfcmp.uo(r1,r0)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_ole_f(float %val, float %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp ole float %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = sfcmp.ge(r0,r1)
+; CHECK-DAG: [[REG1:p([0-3])]] = sfcmp.uo(r0,r1)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_oge_f(float %val, float %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp oge float %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = sfcmp.gt(r1,r0)
+; CHECK-DAG: [[REG1:p([0-3])]] = sfcmp.uo(r1,r0)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_olt_f(float %val, float %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp olt float %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+
+;
+; Functions for double:
+;
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = dfcmp.eq(r1:0,r3:2)
+; CHECK-DAG: [[REG1:p([0-3])]] = dfcmp.uo(r1:0,r3:2)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_oeq_d(double %val, double %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp oeq double %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = dfcmp.eq(r1:0,r3:2)
+; CHECK-DAG: [[REG1:p([0-3])]] = dfcmp.uo(r1:0,r3:2)
+; CHECK: [[REG2:p([0-3])]] = or([[REG0]],[[REG1]])
+; CHECK: r0 = mux([[REG2]],#0,#1)
+;
+define i32 @compare_one_d(double %val, double %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp one double %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = dfcmp.gt(r1:0,r3:2)
+; CHECK-DAG: [[REG1:p([0-3])]] = dfcmp.uo(r1:0,r3:2)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_ogt_d(double %val, double %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp ogt double %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = dfcmp.ge(r3:2,r1:0)
+; CHECK-DAG: [[REG1:p([0-3])]] = dfcmp.uo(r3:2,r1:0)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_ole_d(double %val, double %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp ole double %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = dfcmp.ge(r1:0,r3:2)
+; CHECK-DAG: [[REG1:p([0-3])]] = dfcmp.uo(r1:0,r3:2)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_oge_d(double %val, double %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp oge double %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+
+
+;
+; CHECK-DAG: [[REG0:p([0-3])]] = dfcmp.gt(r3:2,r1:0)
+; CHECK-DAG: [[REG1:p([0-3])]] = dfcmp.uo(r3:2,r1:0)
+; CHECK: [[REG2:p([0-3])]] = and([[REG0]],![[REG1]])
+; CHECK: r0 = mux([[REG2]],#1,#0)
+;
+define i32 @compare_olt_d(double %val, double %val2) local_unnamed_addr #0 {
+entry:
+  %cmpinf = fcmp olt double %val, %val2
+  %0 = zext i1 %cmpinf to i32
+  ret i32 %0
+}
+

From 3e2801eb634e6010ee783ea7c93718bbb6ed4c35 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 19 Mar 2025 10:19:57 -0400
Subject: [PATCH 274/282] [PowerPC] Support conversion between f16 and f128
 (#130158)

Enables conversion between f16 and f128.
Expanding on pre-Power9 targets and using HW instructions on Power9.

Fixes https://github.com/llvm/llvm-project/issues/92866
Commandeer of:  https://github.com/llvm/llvm-project/pull/97677

---------

Co-authored-by: esmeyi <esme.yi@ibm.com>
(cherry picked from commit ade22fc1d9616c95bd1aa4ea658a21ddb073b73c)
---
 llvm/lib/IR/RuntimeLibcalls.cpp             |   1 +
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |   6 +
 llvm/lib/Target/PowerPC/PPCInstrVSX.td      |   4 +
 llvm/test/CodeGen/PowerPC/f128-conv.ll      | 487 ++++++++------------
 llvm/test/CodeGen/PowerPC/fp128-libcalls.ll |  17 +
 5 files changed, 215 insertions(+), 300 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index e38fce764b640..085a3bc0586b6 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -82,6 +82,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::POWI_F128, "__powikf2");
     setLibcallName(RTLIB::FPEXT_F32_F128, "__extendsfkf2");
     setLibcallName(RTLIB::FPEXT_F64_F128, "__extenddfkf2");
+    setLibcallName(RTLIB::FPROUND_F128_F16, "__trunckfhf2");
     setLibcallName(RTLIB::FPROUND_F128_F32, "__trunckfsf2");
     setLibcallName(RTLIB::FPROUND_F128_F64, "__trunckfdf2");
     setLibcallName(RTLIB::FPTOSINT_F128_I32, "__fixkfsi");
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 21ff6f050817a..16491a145a5b9 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -223,13 +223,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
   }
 
+  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
+
   if (Subtarget.isISA3_0()) {
+    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Legal);
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
     setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
     setTruncStoreAction(MVT::f64, MVT::f16, Legal);
     setTruncStoreAction(MVT::f32, MVT::f16, Legal);
   } else {
     // No extending loads from f16 or HW conversions back and forth.
+    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
+    setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
     setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 8e400bc63b785..a8724ea125140 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3997,6 +3997,8 @@ defm : ScalToVecWPermute<
   (SUBREG_TO_REG (i64 1), (VEXTSH2Ds (LXSIHZX ForceXForm:$src)), sub_64)>;
 
 // Load/convert and convert/store patterns for f16.
+def : Pat<(f128 (extloadf16 ForceXForm:$src)),
+          (f128 (XSCVDPQP (XSCVHPDP (LXSIHZX ForceXForm:$src))))>;
 def : Pat<(f64 (extloadf16 ForceXForm:$src)),
           (f64 (XSCVHPDP (LXSIHZX ForceXForm:$src)))>;
 def : Pat<(truncstoref16 f64:$src, ForceXForm:$dst),
@@ -4005,6 +4007,8 @@ def : Pat<(f32 (extloadf16 ForceXForm:$src)),
           (f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX ForceXForm:$src)), VSSRC))>;
 def : Pat<(truncstoref16 f32:$src, ForceXForm:$dst),
           (STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), ForceXForm:$dst)>;
+def : Pat<(f128 (f16_to_fp i32:$A)),
+          (f128 (XSCVDPQP (XSCVHPDP (MTVSRWZ $A))))>;
 def : Pat<(f64 (f16_to_fp i32:$A)),
           (f64 (XSCVHPDP (MTVSRWZ $A)))>;
 def : Pat<(f32 (f16_to_fp i32:$A)),
diff --git a/llvm/test/CodeGen/PowerPC/f128-conv.ll b/llvm/test/CodeGen/PowerPC/f128-conv.ll
index d8eed1fb4092c..f8b2861156db4 100644
--- a/llvm/test/CodeGen/PowerPC/f128-conv.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-conv.ll
@@ -10,11 +10,11 @@
 @umem = global [5 x i64] [i64 560, i64 100, i64 34, i64 2, i64 5], align 8
 @swMem = global [5 x i32] [i32 5, i32 2, i32 3, i32 4, i32 0], align 4
 @uwMem = global [5 x i32] [i32 5, i32 2, i32 3, i32 4, i32 0], align 4
-@uhwMem = local_unnamed_addr global [5 x i16] [i16 5, i16 2, i16 3, i16 4, i16 0], align 2
-@ubMem = local_unnamed_addr global [5 x i8] c"\05\02\03\04\00", align 1
+@uhwMem = global [5 x i16] [i16 5, i16 2, i16 3, i16 4, i16 0], align 2
+@ubMem = global [5 x i8] c"\05\02\03\04\00", align 1
 
 ; Function Attrs: norecurse nounwind
-define void @sdwConv2qp(ptr nocapture %a, i64 %b) {
+define void @sdwConv2qp(ptr nocapture %a, i64 %b) nounwind {
 ; CHECK-LABEL: sdwConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mtvsrd v2, r4
@@ -25,9 +25,6 @@ define void @sdwConv2qp(ptr nocapture %a, i64 %b) {
 ; CHECK-P8-LABEL: sdwConv2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -50,13 +47,10 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @sdwConv2qp_01(ptr nocapture %a, i128 %b) {
+define void @sdwConv2qp_01(ptr nocapture %a, i128 %b) nounwind {
 ; CHECK-LABEL: sdwConv2qp_01:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset r30, -16
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    stdu r1, -48(r1)
 ; CHECK-NEXT:    mr r30, r3
@@ -75,9 +69,6 @@ define void @sdwConv2qp_01(ptr nocapture %a, i128 %b) {
 ; CHECK-P8-LABEL: sdwConv2qp_01:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -101,7 +92,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @sdwConv2qp_02(ptr nocapture %a) {
+define void @sdwConv2qp_02(ptr nocapture %a) nounwind {
 ; CHECK-LABEL: sdwConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
@@ -114,9 +105,6 @@ define void @sdwConv2qp_02(ptr nocapture %a) {
 ; CHECK-P8-LABEL: sdwConv2qp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -134,16 +122,16 @@ define void @sdwConv2qp_02(ptr nocapture %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i64, ptr getelementptr inbounds
+  %i = load i64, ptr getelementptr inbounds
                         ([5 x i64], ptr @mem, i64 0, i64 2), align 8
-  %conv = sitofp i64 %0 to fp128
+  %conv = sitofp i64 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @sdwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) {
+define void @sdwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) nounwind {
 ; CHECK-LABEL: sdwConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsd v2, 0(r4)
@@ -154,9 +142,6 @@ define void @sdwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-LABEL: sdwConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -172,15 +157,15 @@ define void @sdwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i64, ptr %b, align 8
-  %conv = sitofp i64 %0 to fp128
+  %i = load i64, ptr %b, align 8
+  %conv = sitofp i64 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @sdwConv2qp_04(ptr nocapture %a, i1 %b) {
+define void @sdwConv2qp_04(ptr nocapture %a, i1 %b) nounwind {
 ; CHECK-LABEL: sdwConv2qp_04:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi. r4, r4, 1
@@ -195,9 +180,6 @@ define void @sdwConv2qp_04(ptr nocapture %a, i1 %b) {
 ; CHECK-P8-LABEL: sdwConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -223,7 +205,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @udwConv2qp(ptr nocapture %a, i64 %b) {
+define void @udwConv2qp(ptr nocapture %a, i64 %b) nounwind {
 ; CHECK-LABEL: udwConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mtvsrd v2, r4
@@ -234,9 +216,6 @@ define void @udwConv2qp(ptr nocapture %a, i64 %b) {
 ; CHECK-P8-LABEL: udwConv2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -259,13 +238,10 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @udwConv2qp_01(ptr nocapture %a, i128 %b) {
+define void @udwConv2qp_01(ptr nocapture %a, i128 %b) nounwind {
 ; CHECK-LABEL: udwConv2qp_01:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset r30, -16
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    stdu r1, -48(r1)
 ; CHECK-NEXT:    mr r30, r3
@@ -284,9 +260,6 @@ define void @udwConv2qp_01(ptr nocapture %a, i128 %b) {
 ; CHECK-P8-LABEL: udwConv2qp_01:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -310,7 +283,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @udwConv2qp_02(ptr nocapture %a) {
+define void @udwConv2qp_02(ptr nocapture %a) nounwind {
 ; CHECK-LABEL: udwConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC1@toc@ha
@@ -323,9 +296,6 @@ define void @udwConv2qp_02(ptr nocapture %a) {
 ; CHECK-P8-LABEL: udwConv2qp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -343,16 +313,16 @@ define void @udwConv2qp_02(ptr nocapture %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i64, ptr getelementptr inbounds
+  %i = load i64, ptr getelementptr inbounds
                         ([5 x i64], ptr @umem, i64 0, i64 4), align 8
-  %conv = uitofp i64 %0 to fp128
+  %conv = uitofp i64 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @udwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) {
+define void @udwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) nounwind {
 ; CHECK-LABEL: udwConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsd v2, 0(r4)
@@ -363,9 +333,6 @@ define void @udwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-LABEL: udwConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -381,15 +348,15 @@ define void @udwConv2qp_03(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i64, ptr %b, align 8
-  %conv = uitofp i64 %0 to fp128
+  %i = load i64, ptr %b, align 8
+  %conv = uitofp i64 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @udwConv2qp_04(ptr nocapture %a, i1 %b) {
+define void @udwConv2qp_04(ptr nocapture %a, i1 %b) nounwind {
 ; CHECK-LABEL: udwConv2qp_04:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    clrlwi r4, r4, 31
@@ -401,9 +368,6 @@ define void @udwConv2qp_04(ptr nocapture %a, i1 %b) {
 ; CHECK-P8-LABEL: udwConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -439,9 +403,6 @@ define ptr @sdwConv2qp_testXForm(ptr returned %sink,
 ; CHECK-P8-LABEL: sdwConv2qp_testXForm:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -459,11 +420,11 @@ define ptr @sdwConv2qp_testXForm(ptr returned %sink,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
-                                    ptr nocapture readonly %a) {
+                                 ptr nocapture readonly %a) nounwind {
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %a, i64 73333
-  %0 = load i64, ptr %add.ptr, align 8
-  %conv = sitofp i64 %0 to fp128
+  %i = load i64, ptr %add.ptr, align 8
+  %conv = sitofp i64 %i to fp128
   store fp128 %conv, ptr %sink, align 16
   ret ptr %sink
 
@@ -483,9 +444,6 @@ define ptr @udwConv2qp_testXForm(ptr returned %sink,
 ; CHECK-P8-LABEL: udwConv2qp_testXForm:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -503,18 +461,18 @@ define ptr @udwConv2qp_testXForm(ptr returned %sink,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
-                                    ptr nocapture readonly %a) {
+                                 ptr nocapture readonly %a) nounwind {
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %a, i64 73333
-  %0 = load i64, ptr %add.ptr, align 8
-  %conv = uitofp i64 %0 to fp128
+  %i = load i64, ptr %add.ptr, align 8
+  %conv = uitofp i64 %i to fp128
   store fp128 %conv, ptr %sink, align 16
   ret ptr %sink
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @swConv2qp(ptr nocapture %a, i32 signext %b) {
+define void @swConv2qp(ptr nocapture %a, i32 signext %b) nounwind {
 ; CHECK-LABEL: swConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mtvsrwa v2, r4
@@ -525,9 +483,6 @@ define void @swConv2qp(ptr nocapture %a, i32 signext %b) {
 ; CHECK-P8-LABEL: swConv2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -550,7 +505,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @swConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
+define void @swConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) nounwind {
 ; CHECK-LABEL: swConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsiwax v2, 0, r4
@@ -561,9 +516,6 @@ define void @swConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-LABEL: swConv2qp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -579,15 +531,15 @@ define void @swConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i32, ptr %b, align 4
-  %conv = sitofp i32 %0 to fp128
+  %i = load i32, ptr %b, align 4
+  %conv = sitofp i32 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @swConv2qp_03(ptr nocapture %a) {
+define void @swConv2qp_03(ptr nocapture %a) nounwind {
 ; CHECK-LABEL: swConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC2@toc@ha
@@ -601,9 +553,6 @@ define void @swConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-LABEL: swConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -621,16 +570,16 @@ define void @swConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i32, ptr getelementptr inbounds
+  %i = load i32, ptr getelementptr inbounds
                         ([5 x i32], ptr @swMem, i64 0, i64 3), align 4
-  %conv = sitofp i32 %0 to fp128
+  %conv = sitofp i32 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @uwConv2qp(ptr nocapture %a, i32 zeroext %b) {
+define void @uwConv2qp(ptr nocapture %a, i32 zeroext %b) nounwind {
 ; CHECK-LABEL: uwConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mtvsrwz v2, r4
@@ -641,9 +590,6 @@ define void @uwConv2qp(ptr nocapture %a, i32 zeroext %b) {
 ; CHECK-P8-LABEL: uwConv2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -666,7 +612,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @uwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
+define void @uwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) nounwind {
 ; CHECK-LABEL: uwConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsiwzx v2, 0, r4
@@ -677,9 +623,6 @@ define void @uwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-LABEL: uwConv2qp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -695,15 +638,15 @@ define void @uwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i32, ptr %b, align 4
-  %conv = uitofp i32 %0 to fp128
+  %i = load i32, ptr %b, align 4
+  %conv = uitofp i32 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @uwConv2qp_03(ptr nocapture %a) {
+define void @uwConv2qp_03(ptr nocapture %a) nounwind {
 ; CHECK-LABEL: uwConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC3@toc@ha
@@ -717,9 +660,6 @@ define void @uwConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-LABEL: uwConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -737,9 +677,9 @@ define void @uwConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i32, ptr getelementptr inbounds
+  %i = load i32, ptr getelementptr inbounds
                         ([5 x i32], ptr @uwMem, i64 0, i64 3), align 4
-  %conv = uitofp i32 %0 to fp128
+  %conv = uitofp i32 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
@@ -759,9 +699,6 @@ define void @uwConv2qp_04(ptr nocapture %a,
 ; CHECK-P8-LABEL: uwConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -778,10 +715,10 @@ define void @uwConv2qp_04(ptr nocapture %a,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
-                          i32 zeroext %b, ptr nocapture readonly %c) {
+                          i32 zeroext %b, ptr nocapture readonly %c) nounwind {
 entry:
-  %0 = load i32, ptr %c, align 4
-  %add = add i32 %0, %b
+  %i = load i32, ptr %c, align 4
+  %add = add i32 %i, %b
   %conv = uitofp i32 %add to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
@@ -789,7 +726,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @uhwConv2qp(ptr nocapture %a, i16 zeroext %b) {
+define void @uhwConv2qp(ptr nocapture %a, i16 zeroext %b) nounwind {
 ; CHECK-LABEL: uhwConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mtvsrwz v2, r4
@@ -800,9 +737,6 @@ define void @uhwConv2qp(ptr nocapture %a, i16 zeroext %b) {
 ; CHECK-P8-LABEL: uhwConv2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -826,7 +760,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @uhwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
+define void @uhwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) nounwind {
 ; CHECK-LABEL: uhwConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsihzx v2, 0, r4
@@ -837,9 +771,6 @@ define void @uhwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-LABEL: uhwConv2qp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -855,15 +786,15 @@ define void @uhwConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i16, ptr %b, align 2
-  %conv = uitofp i16 %0 to fp128
+  %i = load i16, ptr %b, align 2
+  %conv = uitofp i16 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @uhwConv2qp_03(ptr nocapture %a) {
+define void @uhwConv2qp_03(ptr nocapture %a) nounwind {
 ; CHECK-LABEL: uhwConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC4@toc@ha
@@ -877,9 +808,6 @@ define void @uhwConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-LABEL: uhwConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -897,9 +825,9 @@ define void @uhwConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i16, ptr getelementptr inbounds
+  %i = load i16, ptr getelementptr inbounds
                         ([5 x i16], ptr @uhwMem, i64 0, i64 3), align 2
-  %conv = uitofp i16 %0 to fp128
+  %conv = uitofp i16 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
@@ -919,9 +847,6 @@ define void @uhwConv2qp_04(ptr nocapture %a, i16 zeroext %b,
 ; CHECK-P8-LABEL: uhwConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -938,11 +863,11 @@ define void @uhwConv2qp_04(ptr nocapture %a, i16 zeroext %b,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
-                           ptr nocapture readonly %c) {
+                           ptr nocapture readonly %c) nounwind {
 entry:
   %conv = zext i16 %b to i32
-  %0 = load i16, ptr %c, align 2
-  %conv1 = zext i16 %0 to i32
+  %i = load i16, ptr %c, align 2
+  %conv1 = zext i16 %i to i32
   %add = add nuw nsw i32 %conv1, %conv
   %conv2 = sitofp i32 %add to fp128
   store fp128 %conv2, ptr %a, align 16
@@ -951,7 +876,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @ubConv2qp(ptr nocapture %a, i8 zeroext %b) {
+define void @ubConv2qp(ptr nocapture %a, i8 zeroext %b) nounwind {
 ; CHECK-LABEL: ubConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mtvsrwz v2, r4
@@ -962,9 +887,6 @@ define void @ubConv2qp(ptr nocapture %a, i8 zeroext %b) {
 ; CHECK-P8-LABEL: ubConv2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -987,7 +909,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @ubConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
+define void @ubConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) nounwind {
 ; CHECK-LABEL: ubConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsibzx v2, 0, r4
@@ -998,9 +920,6 @@ define void @ubConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-LABEL: ubConv2qp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -1016,15 +935,15 @@ define void @ubConv2qp_02(ptr nocapture %a, ptr nocapture readonly %b) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i8, ptr %b, align 1
-  %conv = uitofp i8 %0 to fp128
+  %i = load i8, ptr %b, align 1
+  %conv = uitofp i8 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
 }
 
 ; Function Attrs: norecurse nounwind
-define void @ubConv2qp_03(ptr nocapture %a) {
+define void @ubConv2qp_03(ptr nocapture %a) nounwind {
 ; CHECK-LABEL: ubConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC5@toc@ha
@@ -1038,9 +957,6 @@ define void @ubConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-LABEL: ubConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -1058,9 +974,9 @@ define void @ubConv2qp_03(ptr nocapture %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load i8, ptr getelementptr inbounds
+  %i = load i8, ptr getelementptr inbounds
                       ([5 x i8], ptr @ubMem, i64 0, i64 2), align 1
-  %conv = uitofp i8 %0 to fp128
+  %conv = uitofp i8 %i to fp128
   store fp128 %conv, ptr %a, align 16
   ret void
 
@@ -1080,9 +996,6 @@ define void @ubConv2qp_04(ptr nocapture %a, i8 zeroext %b,
 ; CHECK-P8-LABEL: ubConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -1099,11 +1012,11 @@ define void @ubConv2qp_04(ptr nocapture %a, i8 zeroext %b,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
-                          ptr nocapture readonly %c) {
+                          ptr nocapture readonly %c) nounwind {
 entry:
   %conv = zext i8 %b to i32
-  %0 = load i8, ptr %c, align 1
-  %conv1 = zext i8 %0 to i32
+  %i = load i8, ptr %c, align 1
+  %conv1 = zext i8 %i to i32
   %add = add nuw nsw i32 %conv1, %conv
   %conv2 = sitofp i32 %add to fp128
   store fp128 %conv2, ptr %a, align 16
@@ -1121,7 +1034,7 @@ entry:
 @f128global = global fp128 0xL300000000000000040089CA8F5C28F5C, align 16
 
 ; Function Attrs: norecurse nounwind readonly
-define double @qpConv2dp(ptr nocapture readonly %a) {
+define double @qpConv2dp(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: qpConv2dp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv v2, 0(r3)
@@ -1134,8 +1047,6 @@ define double @qpConv2dp(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl __trunckfdf2
@@ -1145,13 +1056,13 @@ define double @qpConv2dp(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr %a, align 16
-  %conv = fptrunc fp128 %0 to double
+  %i = load fp128, ptr %a, align 16
+  %conv = fptrunc fp128 %i to double
   ret double %conv
 }
 
 ; Function Attrs: norecurse nounwind
-define void @qpConv2dp_02(ptr nocapture %res) {
+define void @qpConv2dp_02(ptr nocapture %res) nounwind {
 ; CHECK-LABEL: qpConv2dp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC6@toc@ha
@@ -1164,9 +1075,6 @@ define void @qpConv2dp_02(ptr nocapture %res) {
 ; CHECK-P8-LABEL: qpConv2dp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -1184,14 +1092,14 @@ define void @qpConv2dp_02(ptr nocapture %res) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr @f128global, align 16
-  %conv = fptrunc fp128 %0 to double
+  %i = load fp128, ptr @f128global, align 16
+  %conv = fptrunc fp128 %i to double
   store double %conv, ptr %res, align 8
   ret void
 }
 
 ; Function Attrs: norecurse nounwind
-define void @qpConv2dp_03(ptr nocapture %res, i32 signext %idx) {
+define void @qpConv2dp_03(ptr nocapture %res, i32 signext %idx) nounwind {
 ; CHECK-LABEL: qpConv2dp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r5, r2, .LC7@toc@ha
@@ -1205,10 +1113,6 @@ define void @qpConv2dp_03(ptr nocapture %res, i32 signext %idx) {
 ; CHECK-P8-LABEL: qpConv2dp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -64(r1)
@@ -1230,8 +1134,8 @@ define void @qpConv2dp_03(ptr nocapture %res, i32 signext %idx) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr @f128Array, align 16
-  %conv = fptrunc fp128 %0 to double
+  %i = load fp128, ptr @f128Array, align 16
+  %conv = fptrunc fp128 %i to double
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds double, ptr %res, i64 %idxprom
   store double %conv, ptr %arrayidx, align 8
@@ -1239,7 +1143,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @qpConv2dp_04(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %res) {
+define void @qpConv2dp_04(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ; CHECK-LABEL: qpConv2dp_04:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv v2, 0(r3)
@@ -1252,9 +1156,6 @@ define void @qpConv2dp_04(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ; CHECK-P8-LABEL: qpConv2dp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -1273,10 +1174,11 @@ define void @qpConv2dp_04(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
+                          ptr nocapture %res) nounwind {
 entry:
-  %0 = load fp128, ptr %a, align 16
+  %i = load fp128, ptr %a, align 16
   %1 = load fp128, ptr %b, align 16
-  %add = fadd fp128 %0, %1
+  %add = fadd fp128 %i, %1
   %conv = fptrunc fp128 %add to double
   store double %conv, ptr %res, align 8
   ret void
@@ -1285,7 +1187,7 @@ entry:
 ;  Convert QP to SP
 
 ; Function Attrs: norecurse nounwind readonly
-define float @qpConv2sp(ptr nocapture readonly %a) {
+define float @qpConv2sp(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: qpConv2sp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv v2, 0(r3)
@@ -1298,8 +1200,6 @@ define float @qpConv2sp(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl __trunckfsf2
@@ -1309,13 +1209,13 @@ define float @qpConv2sp(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr %a, align 16
-  %conv = fptrunc fp128 %0 to float
+  %i = load fp128, ptr %a, align 16
+  %conv = fptrunc fp128 %i to float
   ret float %conv
 }
 
 ; Function Attrs: norecurse nounwind
-define void @qpConv2sp_02(ptr nocapture %res) {
+define void @qpConv2sp_02(ptr nocapture %res) nounwind {
 ; CHECK-LABEL: qpConv2sp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC6@toc@ha
@@ -1329,9 +1229,6 @@ define void @qpConv2sp_02(ptr nocapture %res) {
 ; CHECK-P8-LABEL: qpConv2sp_02:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
@@ -1349,14 +1246,14 @@ define void @qpConv2sp_02(ptr nocapture %res) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr @f128global, align 16
-  %conv = fptrunc fp128 %0 to float
+  %i = load fp128, ptr @f128global, align 16
+  %conv = fptrunc fp128 %i to float
   store float %conv, ptr %res, align 4
   ret void
 }
 
 ; Function Attrs: norecurse nounwind
-define void @qpConv2sp_03(ptr nocapture %res, i32 signext %idx) {
+define void @qpConv2sp_03(ptr nocapture %res, i32 signext %idx) nounwind {
 ; CHECK-LABEL: qpConv2sp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r5, r2, .LC7@toc@ha
@@ -1371,10 +1268,6 @@ define void @qpConv2sp_03(ptr nocapture %res, i32 signext %idx) {
 ; CHECK-P8-LABEL: qpConv2sp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -64(r1)
@@ -1397,8 +1290,8 @@ define void @qpConv2sp_03(ptr nocapture %res, i32 signext %idx) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr getelementptr inbounds ([4 x fp128], ptr @f128Array, i64 0, i64 3), align 16
-  %conv = fptrunc fp128 %0 to float
+  %i = load fp128, ptr getelementptr inbounds ([4 x fp128], ptr @f128Array, i64 0, i64 3), align 16
+  %conv = fptrunc fp128 %i to float
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds float, ptr %res, i64 %idxprom
   store float %conv, ptr %arrayidx, align 4
@@ -1406,7 +1299,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @qpConv2sp_04(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %res) {
+define void @qpConv2sp_04(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ; CHECK-LABEL: qpConv2sp_04:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv v2, 0(r3)
@@ -1420,9 +1313,6 @@ define void @qpConv2sp_04(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ; CHECK-P8-LABEL: qpConv2sp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -1441,19 +1331,78 @@ define void @qpConv2sp_04(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
+                          ptr nocapture %res) nounwind {
 entry:
-  %0 = load fp128, ptr %a, align 16
+  %i = load fp128, ptr %a, align 16
   %1 = load fp128, ptr %b, align 16
-  %add = fadd fp128 %0, %1
+  %add = fadd fp128 %i, %1
   %conv = fptrunc fp128 %add to float
   store float %conv, ptr %res, align 4
   ret void
 }
 
+define half @trunc(fp128 %a) nounwind {
+; CHECK-LABEL: trunc:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    bl __trunckfhf2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    clrlwi r3, r3, 16
+; CHECK-NEXT:    mtfprwz f0, r3
+; CHECK-NEXT:    xscvhpdp f1, f0
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: trunc:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    bl __trunckfhf2
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    clrldi r3, r3, 48
+; CHECK-P8-NEXT:    bl __extendhfsf2
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+entry:
+  %i = fptrunc fp128 %a to half
+  ret half %i
+}
+
+define fp128 @ext(half %a) nounwind {
+; CHECK-LABEL: ext:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xscpsgndp v2, f1, f1
+; CHECK-NEXT:    xscvdpqp v2, v2
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: ext:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    bl __extendsfkf2
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+entry:
+  %i = fpext half %a to fp128
+  ret fp128 %i
+}
+
 @f128Glob = common global fp128 0xL00000000000000000000000000000000, align 16
 
 ; Function Attrs: norecurse nounwind readnone
-define fp128 @dpConv2qp(double %a) {
+define fp128 @dpConv2qp(double %a) nounwind {
 ; CHECK-LABEL: dpConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscpsgndp v2, f1, f1
@@ -1465,8 +1414,6 @@ define fp128 @dpConv2qp(double %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
@@ -1479,7 +1426,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @dpConv2qp_02(ptr nocapture readonly %a) {
+define void @dpConv2qp_02(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: dpConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxsd v2, 0(r3)
@@ -1494,8 +1441,6 @@ define void @dpConv2qp_02(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lfd f1, 0(r3)
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
@@ -1508,14 +1453,14 @@ define void @dpConv2qp_02(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load double, ptr %a, align 8
-  %conv = fpext double %0 to fp128
+  %i = load double, ptr %a, align 8
+  %conv = fpext double %i to fp128
   store fp128 %conv, ptr @f128Glob, align 16
   ret void
 }
 
 ; Function Attrs: norecurse nounwind
-define void @dpConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) {
+define void @dpConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) nounwind {
 ; CHECK-LABEL: dpConv2qp_02b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sldi r4, r4, 3
@@ -1530,10 +1475,8 @@ define void @dpConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) {
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
-; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    sldi r4, r4, 3
+; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    lfdx f1, r3, r4
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
@@ -1548,14 +1491,14 @@ define void @dpConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) {
 entry:
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds double, ptr %a, i64 %idxprom
-  %0 = load double, ptr %arrayidx, align 8
-  %conv = fpext double %0 to fp128
+  %i = load double, ptr %arrayidx, align 8
+  %conv = fpext double %i to fp128
   store fp128 %conv, ptr @f128Glob, align 16
   ret void
 }
 
 ; Function Attrs: norecurse nounwind
-define void @dpConv2qp_03(ptr nocapture %res, i32 signext %idx, double %a) {
+define void @dpConv2qp_03(ptr nocapture %res, i32 signext %idx, double %a) nounwind {
 ; CHECK-LABEL: dpConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscpsgndp v2, f1, f1
@@ -1567,10 +1510,6 @@ define void @dpConv2qp_03(ptr nocapture %res, i32 signext %idx, double %a) {
 ; CHECK-P8-LABEL: dpConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -64(r1)
@@ -1597,7 +1536,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @dpConv2qp_04(double %a, ptr nocapture %res) {
+define void @dpConv2qp_04(double %a, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: dpConv2qp_04:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscpsgndp v2, f1, f1
@@ -1608,9 +1547,6 @@ define void @dpConv2qp_04(double %a, ptr nocapture %res) {
 ; CHECK-P8-LABEL: dpConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -1631,7 +1567,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind readnone
-define fp128 @spConv2qp(float %a) {
+define fp128 @spConv2qp(float %a) nounwind {
 ; CHECK-LABEL: spConv2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscpsgndp v2, f1, f1
@@ -1643,8 +1579,6 @@ define fp128 @spConv2qp(float %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
@@ -1657,7 +1591,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @spConv2qp_02(ptr nocapture readonly %a) {
+define void @spConv2qp_02(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: spConv2qp_02:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxssp v2, 0(r3)
@@ -1672,8 +1606,6 @@ define void @spConv2qp_02(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lfs f1, 0(r3)
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
@@ -1686,14 +1618,14 @@ define void @spConv2qp_02(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load float, ptr %a, align 4
-  %conv = fpext float %0 to fp128
+  %i = load float, ptr %a, align 4
+  %conv = fpext float %i to fp128
   store fp128 %conv, ptr @f128Glob, align 16
   ret void
 }
 
 ; Function Attrs: norecurse nounwind
-define void @spConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) {
+define void @spConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) nounwind {
 ; CHECK-LABEL: spConv2qp_02b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sldi r4, r4, 2
@@ -1708,10 +1640,8 @@ define void @spConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) {
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
-; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    sldi r4, r4, 2
+; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    lfsx f1, r3, r4
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
@@ -1726,14 +1656,14 @@ define void @spConv2qp_02b(ptr nocapture readonly %a, i32 signext %idx) {
 entry:
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds float, ptr %a, i64 %idxprom
-  %0 = load float, ptr %arrayidx, align 4
-  %conv = fpext float %0 to fp128
+  %i = load float, ptr %arrayidx, align 4
+  %conv = fpext float %i to fp128
   store fp128 %conv, ptr @f128Glob, align 16
   ret void
 }
 
 ; Function Attrs: norecurse nounwind
-define void @spConv2qp_03(ptr nocapture %res, i32 signext %idx, float %a) {
+define void @spConv2qp_03(ptr nocapture %res, i32 signext %idx, float %a) nounwind {
 ; CHECK-LABEL: spConv2qp_03:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscpsgndp v2, f1, f1
@@ -1745,10 +1675,6 @@ define void @spConv2qp_03(ptr nocapture %res, i32 signext %idx, float %a) {
 ; CHECK-P8-LABEL: spConv2qp_03:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -64(r1)
@@ -1775,7 +1701,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @spConv2qp_04(float %a, ptr nocapture %res) {
+define void @spConv2qp_04(float %a, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: spConv2qp_04:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscpsgndp v2, f1, f1
@@ -1786,9 +1712,6 @@ define void @spConv2qp_04(float %a, ptr nocapture %res) {
 ; CHECK-P8-LABEL: spConv2qp_04:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    std r0, 64(r1)
@@ -1810,7 +1733,7 @@ entry:
 
 
 ; Function Attrs: norecurse nounwind
-define void @cvdp2sw2qp(double %val, ptr nocapture %res) {
+define void @cvdp2sw2qp(double %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvdp2sw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpsxws v2, f1
@@ -1822,9 +1745,6 @@ define void @cvdp2sw2qp(double %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvdp2sw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpsxws f0, f1
@@ -1849,7 +1769,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvdp2sdw2qp(double %val, ptr nocapture %res) {
+define void @cvdp2sdw2qp(double %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvdp2sdw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpsxds v2, f1
@@ -1860,9 +1780,6 @@ define void @cvdp2sdw2qp(double %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvdp2sdw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpsxds f0, f1
@@ -1886,7 +1803,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvsp2sw2qp(float %val, ptr nocapture %res) {
+define void @cvsp2sw2qp(float %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvsp2sw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpsxws v2, f1
@@ -1898,9 +1815,6 @@ define void @cvsp2sw2qp(float %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvsp2sw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpsxws f0, f1
@@ -1925,7 +1839,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvsp2sdw2qp(float %val, ptr nocapture %res) {
+define void @cvsp2sdw2qp(float %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvsp2sdw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpsxds v2, f1
@@ -1936,9 +1850,6 @@ define void @cvsp2sdw2qp(float %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvsp2sdw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpsxds f0, f1
@@ -1962,7 +1873,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvdp2uw2qp(double %val, ptr nocapture %res) {
+define void @cvdp2uw2qp(double %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvdp2uw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpuxws f0, f1
@@ -1974,9 +1885,6 @@ define void @cvdp2uw2qp(double %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvdp2uw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpuxws f0, f1
@@ -2000,7 +1908,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvdp2udw2qp(double %val, ptr nocapture %res) {
+define void @cvdp2udw2qp(double %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvdp2udw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpuxds v2, f1
@@ -2011,9 +1919,6 @@ define void @cvdp2udw2qp(double %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvdp2udw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpuxds f0, f1
@@ -2037,7 +1942,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvsp2uw2qp(float %val, ptr nocapture %res) {
+define void @cvsp2uw2qp(float %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvsp2uw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpuxws f0, f1
@@ -2049,9 +1954,6 @@ define void @cvsp2uw2qp(float %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvsp2uw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpuxws f0, f1
@@ -2075,7 +1977,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-define void @cvsp2udw2qp(float %val, ptr nocapture %res) {
+define void @cvsp2udw2qp(float %val, ptr nocapture %res) nounwind {
 ; CHECK-LABEL: cvsp2udw2qp:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xscvdpuxds v2, f1
@@ -2086,9 +1988,6 @@ define void @cvsp2udw2qp(float %val, ptr nocapture %res) {
 ; CHECK-P8-LABEL: cvsp2udw2qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    xscvdpuxds f0, f1
@@ -2112,14 +2011,12 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind readonly
-define i128 @qpConv2i128(ptr nocapture readonly %a) {
+define i128 @qpConv2i128(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: qpConv2i128:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    stdu r1, -32(r1)
 ; CHECK-NEXT:    std r0, 48(r1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    lxv v2, 0(r3)
 ; CHECK-NEXT:    bl __fixkfti
 ; CHECK-NEXT:    nop
@@ -2133,8 +2030,6 @@ define i128 @qpConv2i128(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl __fixkfti
@@ -2144,20 +2039,18 @@ define i128 @qpConv2i128(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr %a, align 16
-  %conv = fptosi fp128 %0 to i128
+  %i = load fp128, ptr %a, align 16
+  %conv = fptosi fp128 %i to i128
   ret i128 %conv
 }
 
 ; Function Attrs: norecurse nounwind readonly
-define i128 @qpConv2ui128(ptr nocapture readonly %a) {
+define i128 @qpConv2ui128(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: qpConv2ui128:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    stdu r1, -32(r1)
 ; CHECK-NEXT:    std r0, 48(r1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    lxv v2, 0(r3)
 ; CHECK-NEXT:    bl __fixunskfti
 ; CHECK-NEXT:    nop
@@ -2171,8 +2064,6 @@ define i128 @qpConv2ui128(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl __fixunskfti
@@ -2182,13 +2073,13 @@ define i128 @qpConv2ui128(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr %a, align 16
-  %conv = fptoui fp128 %0 to i128
+  %i = load fp128, ptr %a, align 16
+  %conv = fptoui fp128 %i to i128
   ret i128 %conv
 }
 
 ; Function Attrs: norecurse nounwind readonly
-define i1 @qpConv2ui1(ptr nocapture readonly %a) {
+define i1 @qpConv2ui1(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: qpConv2ui1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv v2, 0(r3)
@@ -2201,8 +2092,6 @@ define i1 @qpConv2ui1(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl __fixkfsi
@@ -2212,13 +2101,13 @@ define i1 @qpConv2ui1(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr %a, align 16
-  %conv = fptoui fp128 %0 to i1
+  %i = load fp128, ptr %a, align 16
+  %conv = fptoui fp128 %i to i1
   ret i1 %conv
 }
 
 ; Function Attrs: norecurse nounwind readonly
-define i1 @qpConv2si1(ptr nocapture readonly %a) {
+define i1 @qpConv2si1(ptr nocapture readonly %a) nounwind {
 ; CHECK-LABEL: qpConv2si1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv v2, 0(r3)
@@ -2231,8 +2120,6 @@ define i1 @qpConv2si1(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mflr r0
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    std r0, 48(r1)
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl __fixkfsi
@@ -2242,7 +2129,7 @@ define i1 @qpConv2si1(ptr nocapture readonly %a) {
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
-  %0 = load fp128, ptr %a, align 16
-  %conv = fptosi fp128 %0 to i1
+  %i = load fp128, ptr %a, align 16
+  %conv = fptosi fp128 %i to i1
   ret i1 %conv
 }
diff --git a/llvm/test/CodeGen/PowerPC/fp128-libcalls.ll b/llvm/test/CodeGen/PowerPC/fp128-libcalls.ll
index 9d875c854e320..e348d40a4cc74 100644
--- a/llvm/test/CodeGen/PowerPC/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/PowerPC/fp128-libcalls.ll
@@ -30,6 +30,15 @@ define fp128 @divkf3(fp128 %a, fp128 %b) {
   ret fp128 %1
 }
 
+
+define fp128 @extendsfkf2_f16(half %a) {
+; CHECK-LABEL: extendsfkf2_f16:
+; CHECK: __extendsfkf2
+entry:
+  %i = fpext half %a to fp128
+  ret fp128 %i
+}
+
 define fp128 @extendsfkf2(float %a) {
 ; CHECK-LABEL: extendsfkf2:
 ; CHECK: __extendsfkf2
@@ -44,6 +53,14 @@ define fp128 @extenddfkf2(double %a) {
   ret fp128 %1
 }
 
+define half @trunctfhf2(fp128 %a) {
+; CHECK-LABEL: trunctfhf2:
+; CHECK: __trunckfhf2
+entry:
+  %i = fptrunc fp128 %a to half
+  ret half %i
+}
+
 define float @trunckfsf2(fp128 %a) {
 ; CHECK-LABEL: trunckfsf2:
 ; CHECK: __trunckfsf2

From 90cc9ca8bcb2b96463dd1b2bb9c43380dbf67f87 Mon Sep 17 00:00:00 2001
From: Ikhlas Ajbar <iajbar@quicinc.com>
Date: Fri, 31 Jan 2025 11:59:39 -0600
Subject: [PATCH 275/282] [Hexagon] Set the default compilation target to V68
 (#125239)

Set the default compilation target to V68 if no Hexagon processor is
specified at the command-line.
Add the elf header changes for v81/v83/v85 architectures.

(cherry picked from commit 759ef5811e2297f2cbe7578f7c118668e3467c6a)
---
 clang/lib/Driver/ToolChains/Hexagon.cpp |  4 +---
 clang/test/Driver/hexagon-cpu-default.c |  4 ++++
 llvm/include/llvm/BinaryFormat/ELF.h    | 10 ++++++++++
 3 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Driver/hexagon-cpu-default.c

diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
index 772a9827cb211..6ea701a7882d1 100644
--- a/clang/lib/Driver/ToolChains/Hexagon.cpp
+++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
@@ -803,9 +803,7 @@ bool HexagonToolChain::isAutoHVXEnabled(const llvm::opt::ArgList &Args) {
 // Returns the default CPU for Hexagon. This is the default compilation target
 // if no Hexagon processor is selected at the command-line.
 //
-StringRef HexagonToolChain::GetDefaultCPU() {
-  return "hexagonv60";
-}
+StringRef HexagonToolChain::GetDefaultCPU() { return "hexagonv68"; }
 
 StringRef HexagonToolChain::GetTargetCPUVersion(const ArgList &Args) {
   Arg *CpuArg = nullptr;
diff --git a/clang/test/Driver/hexagon-cpu-default.c b/clang/test/Driver/hexagon-cpu-default.c
new file mode 100644
index 0000000000000..31fb839f21656
--- /dev/null
+++ b/clang/test/Driver/hexagon-cpu-default.c
@@ -0,0 +1,4 @@
+// CHECK: "-target-cpu" "hexagonv68"
+
+// RUN: %clang -c %s -### --target=hexagon-unknown-elf \
+// RUN:  2>&1 | FileCheck  %s
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 48ae0db80f43e..8853c4a88b0b5 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -619,6 +619,7 @@ enum {
   EF_HEXAGON_MACH_V5 = 0x00000004,   // Hexagon V5
   EF_HEXAGON_MACH_V55 = 0x00000005,  // Hexagon V55
   EF_HEXAGON_MACH_V60 = 0x00000060,  // Hexagon V60
+  EF_HEXAGON_MACH_V61 = 0x00000061,  // Hexagon V61
   EF_HEXAGON_MACH_V62 = 0x00000062,  // Hexagon V62
   EF_HEXAGON_MACH_V65 = 0x00000065,  // Hexagon V65
   EF_HEXAGON_MACH_V66 = 0x00000066,  // Hexagon V66
@@ -630,7 +631,11 @@ enum {
   EF_HEXAGON_MACH_V71T = 0x00008071, // Hexagon V71T
   EF_HEXAGON_MACH_V73 = 0x00000073,  // Hexagon V73
   EF_HEXAGON_MACH_V75 = 0x00000075,  // Hexagon V75
+  EF_HEXAGON_MACH_V77 = 0x00000077,  // Hexagon V77
   EF_HEXAGON_MACH_V79 = 0x00000079,  // Hexagon V79
+  EF_HEXAGON_MACH_V81 = 0x00000081,  // Hexagon V81
+  EF_HEXAGON_MACH_V83 = 0x00000083,  // Hexagon V83
+  EF_HEXAGON_MACH_V85 = 0x00000085,  // Hexagon V85
   EF_HEXAGON_MACH = 0x000003ff,      // Hexagon V..
 
   // Highest ISA version flags
@@ -642,6 +647,7 @@ enum {
   EF_HEXAGON_ISA_V5 = 0x00000040,   // Hexagon V5 ISA
   EF_HEXAGON_ISA_V55 = 0x00000050,  // Hexagon V55 ISA
   EF_HEXAGON_ISA_V60 = 0x00000060,  // Hexagon V60 ISA
+  EF_HEXAGON_ISA_V61 = 0x00000061,  // Hexagon V61 ISA
   EF_HEXAGON_ISA_V62 = 0x00000062,  // Hexagon V62 ISA
   EF_HEXAGON_ISA_V65 = 0x00000065,  // Hexagon V65 ISA
   EF_HEXAGON_ISA_V66 = 0x00000066,  // Hexagon V66 ISA
@@ -651,7 +657,11 @@ enum {
   EF_HEXAGON_ISA_V71 = 0x00000071,  // Hexagon V71 ISA
   EF_HEXAGON_ISA_V73 = 0x00000073,  // Hexagon V73 ISA
   EF_HEXAGON_ISA_V75 = 0x00000075,  // Hexagon V75 ISA
+  EF_HEXAGON_ISA_V77 = 0x00000077,  // Hexagon V77 ISA
   EF_HEXAGON_ISA_V79 = 0x00000079,  // Hexagon V79 ISA
+  EF_HEXAGON_ISA_V81 = 0x00000081,  // Hexagon V81 ISA
+  EF_HEXAGON_ISA_V83 = 0x00000083,  // Hexagon V83 ISA
+  EF_HEXAGON_ISA_V85 = 0x00000085,  // Hexagon V85 ISA
   EF_HEXAGON_ISA = 0x000003ff,      // Hexagon V.. ISA
 };
 

From d1f5a9f66ee237eabe35a1adf88eaaaadabd9783 Mon Sep 17 00:00:00 2001
From: Alexey Karyakin <akaryaki@quicinc.com>
Date: Fri, 21 Mar 2025 20:08:45 -0500
Subject: [PATCH 276/282] [hexagon] Bump the default version to v68 (#132304)

Set the default processor version to v68 when the user does not specify
one in the command line. This includes changes in the LLVM backed and
linker (lld). Since lld normally sets the version based on inputs, this
change will only affect cases when there are no inputs.

Fixes #127558

(cherry picked from commit c0b2c10e9f3a939c227a26aec3ba377f7cc25667)
---
 lld/ELF/Arch/Hexagon.cpp                      |  2 +-
 lld/docs/ReleaseNotes.rst                     |  5 ++
 lld/test/ELF/emulation-hexagon.s              |  4 +-
 lld/test/ELF/hexagon-eflag.s                  |  5 +-
 llvm/docs/ReleaseNotes.md                     |  4 ++
 .../MCTargetDesc/HexagonMCTargetDesc.cpp      |  2 +-
 llvm/test/CodeGen/Hexagon/arg-copy-elison.ll  | 23 ++++----
 .../Hexagon/atomicrmw-cond-sub-clamp.ll       |  4 +-
 .../Hexagon/atomicrmw-uinc-udec-wrap.ll       | 12 ++---
 llvm/test/CodeGen/Hexagon/bank-conflict.mir   |  2 +-
 .../CodeGen/Hexagon/fixed-spill-mutable.ll    |  5 +-
 llvm/test/CodeGen/Hexagon/isel-memory-vNi1.ll | 50 ++++++++++--------
 llvm/test/CodeGen/Hexagon/isel/logical.ll     | 52 +++++++++----------
 llvm/test/CodeGen/Hexagon/isel/select-i1.ll   | 12 ++---
 .../CodeGen/Hexagon/postinc-baseoffset.mir    |  4 +-
 llvm/test/CodeGen/Hexagon/setmemrefs.ll       |  2 +-
 llvm/test/MC/Hexagon/arch-support.s           |  4 ++
 llvm/test/MC/Hexagon/hexagon_attributes.s     | 12 +++--
 ...agon_generated_funcs.ll.generated.expected | 25 +++++----
 ...on_generated_funcs.ll.nogenerated.expected | 25 +++++----
 20 files changed, 134 insertions(+), 120 deletions(-)

diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp
index 23b60672f6317..4ba61db2733c2 100644
--- a/lld/ELF/Arch/Hexagon.cpp
+++ b/lld/ELF/Arch/Hexagon.cpp
@@ -68,7 +68,7 @@ uint32_t Hexagon::calcEFlags() const {
     if (!ret || eflags > *ret)
       ret = eflags;
   }
-  return ret.value_or(/* Default Arch Rev: */ 0x60);
+  return ret.value_or(/* Default Arch Rev: */ EF_HEXAGON_MACH_V68);
 }
 
 static uint32_t applyMask(uint32_t mask, uint32_t data) {
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index e13b0cf0678ce..b8604611e286e 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -76,6 +76,11 @@ ELF Improvements
 * Supported relocation types for LoongArch target: ``R_LARCH_TLS_{LD,GD,DESC}_PCREL20_S2``.
   (`#100105 <https://github.com/llvm/llvm-project/pull/100105>`_)
 
+* The default Hexagon architecture version in ELF object files produced by
+  lld is changed to v68. This change is only effective when the version is
+  not provided in the command line by the user and cannot be inferred from
+  inputs.
+
 Breaking changes
 ----------------
 
diff --git a/lld/test/ELF/emulation-hexagon.s b/lld/test/ELF/emulation-hexagon.s
index a8a02d4c428b5..5bdd88941c269 100644
--- a/lld/test/ELF/emulation-hexagon.s
+++ b/lld/test/ELF/emulation-hexagon.s
@@ -1,5 +1,5 @@
 # REQUIRES: hexagon
-# RUN: llvm-mc -filetype=obj -triple=hexagon %s -o %t.o
+# RUN: llvm-mc -filetype=obj -triple=hexagon --mcpu=hexagonv73 %s -o %t.o
 # RUN: ld.lld %t.o -o %t
 # RUN: llvm-readelf --file-headers %t | FileCheck --check-prefix=CHECK %s
 # RUN: ld.lld -m hexagonelf %t.o -o %t
@@ -26,7 +26,7 @@
 # CHECK-NEXT:    Entry point address:               0x200B4
 # CHECK-NEXT:    Start of program headers:          52 (bytes into file)
 # CHECK-NEXT:    Start of section headers:
-# CHECK-NEXT:    Flags:                             0x60
+# CHECK-NEXT:    Flags:                             0x73
 # CHECK-NEXT:    Size of this header:               52 (bytes)
 # CHECK-NEXT:    Size of program headers:           32 (bytes)
 
diff --git a/lld/test/ELF/hexagon-eflag.s b/lld/test/ELF/hexagon-eflag.s
index dbe8604f69fda..ac9123832ac8a 100644
--- a/lld/test/ELF/hexagon-eflag.s
+++ b/lld/test/ELF/hexagon-eflag.s
@@ -3,10 +3,11 @@
 # RUN: llvm-mc -filetype=obj -mv60 -triple=hexagon-unknown-elf %S/Inputs/hexagon.s -o %t2
 # RUN: ld.lld %t2 %t  -o %t3
 # RUN: llvm-readelf -h  %t3 | FileCheck %s
-# Verify that the largest arch in the input list is selected.
+## Verify that the largest arch in the input list is selected.
 # CHECK: Flags: 0x62
 
+## Verify the arch version when it cannot be inferred from inputs.
 # RUN: llvm-ar rcsD %t4
 # RUN: ld.lld -m hexagonelf %t4 -o %t5
 # RUN: llvm-readelf -h  %t5 | FileCheck --check-prefix=CHECK-EMPTYARCHIVE %s
-# CHECK-EMPTYARCHIVE: Flags: 0x60
+# CHECK-EMPTYARCHIVE: Flags: 0x68
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 958b7adbc4c36..f34003eaf0fe2 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -202,6 +202,10 @@ Changes to the DirectX Backend
 Changes to the Hexagon Backend
 ------------------------------
 
+* The default Hexagon architecture version in ELF object files produced by
+  the tools such as llvm-mc is changed to v68. This version will be set if
+  the user does not provide the CPU version in the command line.
+
 Changes to the LoongArch Backend
 --------------------------------
 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index a98f6048b051c..8d18aade1a2be 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -125,7 +125,7 @@ static cl::opt<bool>
 static cl::opt<bool> EnableHexagonCabac
   ("mcabac", cl::desc("tbd"), cl::init(false));
 
-static StringRef DefaultArch = "hexagonv60";
+static constexpr StringRef DefaultArch = "hexagonv68";
 
 static StringRef HexagonGetArchVariant() {
   if (MV5)
diff --git a/llvm/test/CodeGen/Hexagon/arg-copy-elison.ll b/llvm/test/CodeGen/Hexagon/arg-copy-elison.ll
index f0c30c301f446..52f29eefa5ce6 100644
--- a/llvm/test/CodeGen/Hexagon/arg-copy-elison.ll
+++ b/llvm/test/CodeGen/Hexagon/arg-copy-elison.ll
@@ -1,8 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --include-generated-funcs --version 4
 ; RUN: llc -mtriple hexagon-- -o - %s | FileCheck %s
 
 ; Reproducer for https://github.com/llvm/llvm-project/issues/89060
-;
 ; Problem was a bug in argument copy elison. Given that the %alloca is
 ; eliminated, the same frame index will be used for accessing %alloca and %a
 ; on the fixed stack. Care must be taken when setting up
@@ -11,8 +10,15 @@
 ; ir.alloca name), or make sure that we still detect that they alias each
 ; other if using different kinds of MemOperands to identify the same fixed
 ; stack entry.
-;
 define i32 @f(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 %q1, i32 %a, i32 %q2) {
+  %alloca = alloca i32
+  store i32 %a, ptr %alloca     ; Should be elided.
+  store i32 666, ptr %alloca
+  %x = sub i32 %q1, %q2
+  %y = xor i32 %x, %a           ; Results in a load of %a from fixed stack.
+                                ; Using same frame index as elided %alloca.
+  ret i32 %y
+}
 ; CHECK-LABEL: f:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  // %bb.0:
@@ -24,16 +30,9 @@ define i32 @f(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i
 ; CHECK-NEXT:     r0 = sub(r1,r0)
 ; CHECK-NEXT:     r2 = memw(r29+#32)
 ; CHECK-NEXT:     memw(r29+#32) = ##666
-; CHECK-NEXT:    }
+; CHECK-EMPTY:
+; CHECK-NEXT:    } :mem_noshuf
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r0 = xor(r0,r2)
 ; CHECK-NEXT:     jumpr r31
 ; CHECK-NEXT:    }
-  %alloca = alloca i32
-  store i32 %a, ptr %alloca     ; Should be elided.
-  store i32 666, ptr %alloca
-  %x = sub i32 %q1, %q2
-  %y = xor i32 %x, %a           ; Results in a load of %a from fixed stack.
-                                ; Using same frame index as elided %alloca.
-  ret i32 %y
-}
diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-cond-sub-clamp.ll
index ba09c3e2852df..0e0b64aac4f6d 100644
--- a/llvm/test/CodeGen/Hexagon/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/Hexagon/atomicrmw-cond-sub-clamp.ll
@@ -152,10 +152,8 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:     r5:4 = memd_locked(r0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7:6 = sub(r5:4,r3:2)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = cmp.gtu(r3:2,r5:4)
+; CHECK-NEXT:     r7:6 = sub(r5:4,r3:2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r8 = mux(p0,r4,r6)
diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
index 6b6946d0dbb0e..8e673c1bb06ba 100644
--- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
@@ -156,12 +156,12 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:     r5:4 = memd_locked(r0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     p0 = cmp.gtu(r3:2,r5:4)
 ; CHECK-NEXT:     r9:8 = add(r5:4,r7:6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     p0 = cmp.gtu(r3:2,r5:4)
-; CHECK-NEXT:     if (!p0.new) r8 = add(r1,#0)
-; CHECK-NEXT:     if (!p0.new) r9 = add(r1,#0)
+; CHECK-NEXT:     if (!p0) r8 = add(r1,#0)
+; CHECK-NEXT:     if (!p0) r9 = add(r1,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     memd_locked(r0,p0) = r9:8
@@ -345,13 +345,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:     r5:4 = memd_locked(r0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r13:12 = add(r5:4,r7:6)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     p1 = cmp.gtu(r5:4,r3:2)
 ; CHECK-NEXT:     p0 = cmp.eq(r5:4,r9:8)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     r13:12 = add(r5:4,r7:6)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
 ; CHECK-NEXT:     r1 = mux(p1,r2,r12)
 ; CHECK-NEXT:     r14 = mux(p1,r3,r13)
 ; CHECK-NEXT:    }
diff --git a/llvm/test/CodeGen/Hexagon/bank-conflict.mir b/llvm/test/CodeGen/Hexagon/bank-conflict.mir
index 12d7838b8372d..f32c3868dcf09 100644
--- a/llvm/test/CodeGen/Hexagon/bank-conflict.mir
+++ b/llvm/test/CodeGen/Hexagon/bank-conflict.mir
@@ -8,9 +8,9 @@
 # CHECK: = A2_tfr
 # CHECK: = L2_loadrigp
 
-# CHECK: = L4_loadri_rr
 # CHECK: = S2_tstbit_i
 # CHECK: = L4_loadri_rr
+# CHECK: = L4_loadri_rr
 
 --- |
   %s.0 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %s.1], [5 x i32] }
diff --git a/llvm/test/CodeGen/Hexagon/fixed-spill-mutable.ll b/llvm/test/CodeGen/Hexagon/fixed-spill-mutable.ll
index f99b448cc1a78..aa8766661a24a 100644
--- a/llvm/test/CodeGen/Hexagon/fixed-spill-mutable.ll
+++ b/llvm/test/CodeGen/Hexagon/fixed-spill-mutable.ll
@@ -12,10 +12,11 @@
 ; The problem is that the load will execute before the store, clobbering the
 ; pair r17:16.
 ;
-; Check that the store and the load are not in the same packet.
+
+; Validate that store executes before load.
 ; CHECK: memd{{.*}} = r17:16
-; CHECK: }
 ; CHECK: r17:16 = memd
+; CHECK: } :mem_noshuf
 ; CHECK-LABEL: LBB0_1:
 
 target triple = "hexagon"
diff --git a/llvm/test/CodeGen/Hexagon/isel-memory-vNi1.ll b/llvm/test/CodeGen/Hexagon/isel-memory-vNi1.ll
index e1b848c0d247b..23c919df05556 100644
--- a/llvm/test/CodeGen/Hexagon/isel-memory-vNi1.ll
+++ b/llvm/test/CodeGen/Hexagon/isel-memory-vNi1.ll
@@ -8,12 +8,15 @@ define i64 @f0(ptr %a0, <8 x i8> %a1) #0 {
 ; CHECK-NEXT:     r0 = memub(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5:4 = combine(#0,#0)
+; CHECK-NEXT:     r1 = #0
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = r0
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     r5:4 = vsplatb(r1)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
 ; CHECK-NEXT:     r1:0 = vmux(p0,r3:2,r5:4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -114,7 +117,10 @@ define void @f4(ptr %a0, i64 %a1) #0 {
 ; CHECK-LABEL: f4:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5:4 = combine(#0,#0)
+; CHECK-NEXT:     r1 = #0
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r5:4 = vsplatb(r1)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r3:2,r5:4)
@@ -123,10 +129,10 @@ define void @f4(ptr %a0, i64 %a1) #0 {
 ; CHECK-NEXT:     p0 = not(p0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r1 = p0
+; CHECK-NEXT:     r2 = p0
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     memb(r0+#0) = r1
+; CHECK-NEXT:     memb(r0+#0) = r2
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     jumpr r31
@@ -173,64 +179,64 @@ define void @f6(ptr %a0, i16 %a1) #0 {
 ; CHECK-LABEL: f6:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r2 = extractu(r1,#8,#8)
+; CHECK-NEXT:     r2 = #255
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r3 = #255
+; CHECK-NEXT:     r3 = extractu(r1,#8,#8)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     p1 = !bitsclr(r1,r3)
+; CHECK-NEXT:     p1 = !bitsclr(r1,r2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     p0 = cmp.eq(r2,#0)
+; CHECK-NEXT:     p0 = cmp.eq(r3,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     if (p0) r2 = #0
+; CHECK-NEXT:     if (p0) r3 = #0
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r1 = mux(p1,#8,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r3 = mux(p1,#2,#0)
+; CHECK-NEXT:     r2 = mux(p1,#2,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = setbit(r1,#2)
+; CHECK-NEXT:     if (!p0) r3 = #128
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = setbit(r3,#0)
+; CHECK-NEXT:     r4 = mux(p0,#0,#32)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     if (!p0) r2 = #128
+; CHECK-NEXT:     r5 = setbit(r1,#2)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = mux(p0,#0,#32)
+; CHECK-NEXT:     r6 = setbit(r2,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     if (!p1) r5 = add(r1,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     if (!p1) r6 = add(r3,#0)
+; CHECK-NEXT:      r1 = setbit(r3,#6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r1 = setbit(r2,#6)
+; CHECK-NEXT:     if (!p1) r6 = add(r2,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r3 = setbit(r4,#4)
+; CHECK-NEXT:     r2 = setbit(r4,#4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = or(r6,r5)
+; CHECK-NEXT:     if (!p0) r4 = add(r2,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     if (!p0) r2 = add(r1,#0)
+; CHECK-NEXT:     if (!p0) r3 = add(r1,#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     if (!p0) r4 = add(r3,#0)
+; CHECK-NEXT:     r2 = or(r6,r5)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 |= or(r4,r2)
+; CHECK-NEXT:     r2 |= or(r4,r3)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     memb(r0+#0) = r5
+; CHECK-NEXT:     memb(r0+#0) = r2
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     jumpr r31
diff --git a/llvm/test/CodeGen/Hexagon/isel/logical.ll b/llvm/test/CodeGen/Hexagon/isel/logical.ll
index 7f9c178c42416..669d01dcd6add 100644
--- a/llvm/test/CodeGen/Hexagon/isel/logical.ll
+++ b/llvm/test/CodeGen/Hexagon/isel/logical.ll
@@ -1399,10 +1399,10 @@ define <8 x i8> @f39(<8 x i8> %a0, <8 x i8> %a1) #1 {
 ; CHECK-LABEL: f39:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = ##16843009
+; CHECK-NEXT:     r4 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = ##16843009
+; CHECK-NEXT:     r5:4 = vsplatb(r4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r5:4)
@@ -1431,10 +1431,10 @@ define <8 x i8> @f40(<8 x i8> %a0, <8 x i8> %a1) #1 {
 ; CHECK-LABEL: f40:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = ##16843009
+; CHECK-NEXT:     r4 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = ##16843009
+; CHECK-NEXT:     r5:4 = vsplatb(r4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r5:4)
@@ -1463,10 +1463,10 @@ define <8 x i8> @f41(<8 x i8> %a0, <8 x i8> %a1) #1 {
 ; CHECK-LABEL: f41:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = ##16843009
+; CHECK-NEXT:     r4 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = ##16843009
+; CHECK-NEXT:     r5:4 = vsplatb(r4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r5:4)
@@ -1495,10 +1495,10 @@ define <8 x i8> @f42(<8 x i8> %a0, <8 x i8> %a1) #1 {
 ; CHECK-LABEL: f42:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = ##16843009
+; CHECK-NEXT:     r4 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = ##16843009
+; CHECK-NEXT:     r5:4 = vsplatb(r4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r5:4)
@@ -1528,10 +1528,10 @@ define <8 x i8> @f43(<8 x i8> %a0, <8 x i8> %a1) #1 {
 ; CHECK-LABEL: f43:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r4 = ##16843009
+; CHECK-NEXT:     r4 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r5 = ##16843009
+; CHECK-NEXT:     r5:4 = vsplatb(r4)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r5:4)
@@ -1561,10 +1561,10 @@ define <8 x i8> @f44(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f44:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1598,10 +1598,10 @@ define <8 x i8> @f45(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f45:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1635,10 +1635,10 @@ define <8 x i8> @f46(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f46:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1672,10 +1672,10 @@ define <8 x i8> @f47(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f47:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1709,10 +1709,10 @@ define <8 x i8> @f48(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f48:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1750,10 +1750,10 @@ define <8 x i8> @f49(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f49:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1788,10 +1788,10 @@ define <8 x i8> @f50(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f50:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
@@ -1826,10 +1826,10 @@ define <8 x i8> @f51(<8 x i8> %a0, <8 x i8> %a1, <8 x i8> %a2) #1 {
 ; CHECK-LABEL: f51:
 ; CHECK:       // %bb.0: // %b0
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r6 = ##16843009
+; CHECK-NEXT:     r6 = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r7 = ##16843009
+; CHECK-NEXT:     r7:6 = vsplatb(r6)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = vcmpb.eq(r1:0,r7:6)
diff --git a/llvm/test/CodeGen/Hexagon/isel/select-i1.ll b/llvm/test/CodeGen/Hexagon/isel/select-i1.ll
index 193b354bc5a87..eb77850f0a5a8 100644
--- a/llvm/test/CodeGen/Hexagon/isel/select-i1.ll
+++ b/llvm/test/CodeGen/Hexagon/isel/select-i1.ll
@@ -124,11 +124,9 @@ define void @f4(ptr %a0, ptr %a1, ptr %a2, ptr %a3) {
 ; CHECK-NEXT:     r0 = memub(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r2 = memub(r2+#0)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = r0
 ; CHECK-NEXT:     p1 = r1
+; CHECK-NEXT:     r2 = memub(r2+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p2 = r2
@@ -160,11 +158,9 @@ define void @f5(ptr %a0, ptr %a1, ptr %a2, ptr %a3) {
 ; CHECK-NEXT:     r0 = memub(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r2 = memub(r2+#0)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = r0
 ; CHECK-NEXT:     p1 = r1
+; CHECK-NEXT:     r2 = memub(r2+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p2 = r2
@@ -196,11 +192,9 @@ define void @f6(ptr %a0, ptr %a1, ptr %a2, ptr %a3) {
 ; CHECK-NEXT:     r0 = memub(r0+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     r2 = memub(r2+#0)
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     p0 = r0
 ; CHECK-NEXT:     p1 = r1
+; CHECK-NEXT:     r2 = memub(r2+#0)
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     p2 = r2
diff --git a/llvm/test/CodeGen/Hexagon/postinc-baseoffset.mir b/llvm/test/CodeGen/Hexagon/postinc-baseoffset.mir
index fa07febcbf5a7..172a28b8e64ef 100644
--- a/llvm/test/CodeGen/Hexagon/postinc-baseoffset.mir
+++ b/llvm/test/CodeGen/Hexagon/postinc-baseoffset.mir
@@ -2,11 +2,11 @@
 
 # Check that we don't packetize these two instructions together. It happened
 # earlier because "offset" in the post-increment instruction was taken to be 8.
+# If they are packetized together, make sure "mem_noshuf" attribute is set.
 
 # CHECK: memw(r0+#0) = #-1
-# CHECK: }
-# CHECK: {
 # CHECK: r1 = memw(r0++#8)
+# CHECK: :mem_noshuf
 
 --- |
   define void @fred(ptr %a) { ret void }
diff --git a/llvm/test/CodeGen/Hexagon/setmemrefs.ll b/llvm/test/CodeGen/Hexagon/setmemrefs.ll
index 85f46af7e56ac..13b7b955cb624 100644
--- a/llvm/test/CodeGen/Hexagon/setmemrefs.ll
+++ b/llvm/test/CodeGen/Hexagon/setmemrefs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 < %s | FileCheck %s
 
 ; This test checks to see if, after lowering the two loads below, we set up the
 ; memrefs of the resulting load MIs correctly, so that they are packetized
diff --git a/llvm/test/MC/Hexagon/arch-support.s b/llvm/test/MC/Hexagon/arch-support.s
index 99364cc936912..eb362a7db3caf 100644
--- a/llvm/test/MC/Hexagon/arch-support.s
+++ b/llvm/test/MC/Hexagon/arch-support.s
@@ -11,6 +11,9 @@
 # RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V75 %s
 # RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V79 %s
 
+## Check which arch version llvm-mc sets when the user does not provide one.
+# RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-DEFAULT %s
+
 # RUN: llvm-mc -triple=hexagon -mv5 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
 # RUN: llvm-mc -triple=hexagon -mv55 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
 # RUN: llvm-mc -triple=hexagon -mv60 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
@@ -38,5 +41,6 @@ r1 = r1
 # CHECK-V73: Flags:{{.*}}0x73
 # CHECK-V75: Flags:{{.*}}0x75
 # CHECK-V79: Flags:{{.*}}0x79
+# CHECK-DEFAULT: Flags:{{.*}}0x68
 
 # CHECK-OBJDUMP: { r1 = r1 }
diff --git a/llvm/test/MC/Hexagon/hexagon_attributes.s b/llvm/test/MC/Hexagon/hexagon_attributes.s
index 4cd5223cd2206..8d96993eee99a 100644
--- a/llvm/test/MC/Hexagon/hexagon_attributes.s
+++ b/llvm/test/MC/Hexagon/hexagon_attributes.s
@@ -5,8 +5,11 @@ r3:2=cround(r1:0,#0x0)       // v67, audio
 v3:0.w=vrmpyz(v0.b,r0.b)     // hvxv73, zreg
 v1:0.sf=vadd(v0.bf,v0.bf)    // hvxv73, hvx-ieee-fp
 
-// RUN: llvm-mc --mattr=+v67,+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
-// RUN:   -triple=hexagon -filetype=obj --hexagon-add-build-attributes -o %t.o
+// Note that the CPU version should be set with `--mcpu` and not with attributes
+// because attributes are additive.
+// RUN: llvm-mc -triple=hexagon --mcpu=hexagonv67 \
+// RUN:   --mattr=+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
+// RUN:   -filetype=obj --hexagon-add-build-attributes -o %t.o
 
 // RUN: llvm-readelf -A %t.o | \
 // RUN:   FileCheck %s --match-full-lines --implicit-check-not={{.}} --check-prefix=READELF
@@ -15,8 +18,9 @@ v1:0.sf=vadd(v0.bf,v0.bf)    // hvxv73, hvx-ieee-fp
 /// without manually passing in features when an attribute section is present.
 // RUN: llvm-objdump -d %t.o | FileCheck %s --check-prefix=OBJDUMP
 
-// RUN: llvm-mc --mattr=+v67,+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
-// RUN:   -triple=hexagon -filetype=asm --hexagon-add-build-attributes | \
+// RUN: llvm-mc -triple=hexagon --mcpu=hexagonv67 \
+// RUN:   --mattr=+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
+// RUN:   -filetype=asm --hexagon-add-build-attributes | \
 // RUN:     FileCheck %s --match-full-lines --implicit-check-not={{.}} --check-prefix=ASM
 
 //      READELF: BuildAttributes {
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected
index cd135ce9e011d..e54510ba8e040 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected
@@ -76,29 +76,28 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
 ; CHECK-NEXT:    .cfi_offset r30, -8
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     memw(r29+#4) = #0
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     memw(r29+#8) = #0
-; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:     r1 = memw(r29+#8)
-; CHECK-NEXT:     memw(r29+#12) = #2
-; CHECK-NEXT:    }
+; CHECK-EMPTY:
+; CHECK-NEXT:    } :mem_noshuf
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     p0 = cmp.eq(r1,#0)
+; CHECK-NEXT:     memw(r29+#12) = #2
 ; CHECK-NEXT:     memw(r29+#16) = #3
-; CHECK-NEXT:     memw(r29+#20) = #4
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     p0 = cmp.eq(r1,#0)
-; CHECK-NEXT:     if (p0.new) memw(r29+#16) = #3
-; CHECK-NEXT:     if (p0.new) memw(r29+#12) = #2
+; CHECK-NEXT:     memw(r29+#20) = #4
+; CHECK-NEXT:     if (p0) memw(r29+#16) = #3
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     if (p0) memw(r29+#12) = #2
 ; CHECK-NEXT:     if (p0) memw(r29+#20) = #4
-; CHECK-NEXT:     if (p0) memw(r29+#8) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     if (p0) memw(r29+#8) = #1
 ; CHECK-NEXT:     if (!p0) memw(r29+#16) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -117,15 +116,15 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
 ; CHECK-NEXT:    .cfi_offset r30, -8
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     memw(r29+#4) = #0
-; CHECK-NEXT:     memw(r0+#0) = #1
+; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:     memw(r29+#12) = #2
+; CHECK-NEXT:     memw(r29+#16) = #3
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     memw(r29+#16) = #3
 ; CHECK-NEXT:     memw(r29+#20) = #4
+; CHECK-NEXT:     memw(r0+#0) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    //# InlineAsm Start
 ; CHECK-NEXT:    //# InlineAsm End
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected
index 833bf68fc03de..219d6d004fd8f 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected
@@ -17,29 +17,28 @@ define dso_local i32 @check_boundaries() #0 {
 ; CHECK-NEXT:    .cfi_offset r30, -8
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     memw(r29+#4) = #0
-; CHECK-NEXT:    }
-; CHECK-NEXT:    {
 ; CHECK-NEXT:     memw(r29+#8) = #0
-; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:     r1 = memw(r29+#8)
-; CHECK-NEXT:     memw(r29+#12) = #2
-; CHECK-NEXT:    }
+; CHECK-EMPTY:
+; CHECK-NEXT:    } :mem_noshuf
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     p0 = cmp.eq(r1,#0)
+; CHECK-NEXT:     memw(r29+#12) = #2
 ; CHECK-NEXT:     memw(r29+#16) = #3
-; CHECK-NEXT:     memw(r29+#20) = #4
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     p0 = cmp.eq(r1,#0)
-; CHECK-NEXT:     if (p0.new) memw(r29+#16) = #3
-; CHECK-NEXT:     if (p0.new) memw(r29+#12) = #2
+; CHECK-NEXT:     memw(r29+#20) = #4
+; CHECK-NEXT:     if (p0) memw(r29+#16) = #3
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     if (p0) memw(r29+#12) = #2
 ; CHECK-NEXT:     if (p0) memw(r29+#20) = #4
-; CHECK-NEXT:     if (p0) memw(r29+#8) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
+; CHECK-NEXT:     if (p0) memw(r29+#8) = #1
 ; CHECK-NEXT:     if (!p0) memw(r29+#16) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
@@ -94,15 +93,15 @@ define dso_local i32 @main() #0 {
 ; CHECK-NEXT:    .cfi_offset r30, -8
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     memw(r29+#4) = #0
-; CHECK-NEXT:     memw(r0+#0) = #1
+; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     memw(r29+#8) = #1
 ; CHECK-NEXT:     memw(r29+#12) = #2
+; CHECK-NEXT:     memw(r29+#16) = #3
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    {
-; CHECK-NEXT:     memw(r29+#16) = #3
 ; CHECK-NEXT:     memw(r29+#20) = #4
+; CHECK-NEXT:     memw(r0+#0) = #1
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    //# InlineAsm Start
 ; CHECK-NEXT:    //# InlineAsm End

From 3d5f5ef6b7848deaf45703e170166cad9438bcd9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 27 Mar 2025 16:01:35 -0700
Subject: [PATCH 277/282] workflows: Add missing apt-get update to abi tests
 (#133264)

(cherry picked from commit a10a9134023539ee6ab3d166518487f40e368334)
---
 .github/workflows/libclang-abi-tests.yml | 5 ++++-
 .github/workflows/llvm-tests.yml         | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 41b3075288d2d..ff8f38b43098e 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -103,6 +103,7 @@ jobs:
         uses: llvm/actions/install-ninja@main
       - name: Install abi-compliance-checker
         run: |
+          sudo apt-get update
           sudo apt-get install abi-dumper autoconf pkg-config
       - name: Install universal-ctags
         run: |
@@ -154,7 +155,9 @@ jobs:
           path: build-latest
 
       - name: Install abi-compliance-checker
-        run: sudo apt-get install abi-compliance-checker
+        run: |
+          sudo apt-get update
+          sudo apt-get install abi-compliance-checker
       - name: Compare ABI
         run: |
           for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 9b3d49d4e99b9..92debf2a8a269 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -91,6 +91,7 @@ jobs:
         uses: llvm/actions/install-ninja@main
       - name: Install abi-compliance-checker
         run: |
+          sudo apt-get update
           sudo apt-get install abi-dumper autoconf pkg-config
       - name: Install universal-ctags
         run: |
@@ -163,7 +164,9 @@ jobs:
           path: symbol-list
 
       - name: Install abi-compliance-checker
-        run: sudo apt-get install abi-compliance-checker
+        run: |
+          sudo apt-get update
+          sudo apt-get install abi-compliance-checker
       - name: Compare ABI
         run: |
           if [ -s symbol-list/llvm.symbols ]; then

From 2406e0d4467a265678f4e414f23a88faf5012944 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Thu, 27 Mar 2025 17:46:42 -0700
Subject: [PATCH 278/282] Revert "[MC] Explicitly mark MCSymbol for
 MO_ExternalSymbol" (#133291)

Reverts llvm/llvm-project#108880 .

The patch has no regression test, no description of why the fix is
necessary, and the code is modifying MC datastructures in a way that's
forbidden in the AsmPrinter.

Fixes #132055.

(cherry picked from commit cd6e959102888279dc7e75a41ebd75a08ac3f7a5)
---
 llvm/lib/Target/X86/X86MCInstLower.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 645a9baeba65c..680bf4286da0c 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -348,12 +348,8 @@ MCOperand X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
     return MCOperand::createImm(MO.getImm());
   case MachineOperand::MO_MachineBasicBlock:
   case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
     return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
-  case MachineOperand::MO_ExternalSymbol: {
-    MCSymbol *Sym = GetSymbolFromOperand(MO);
-    Sym->setExternal(true);
-    return LowerSymbolOperand(MO, Sym);
-  }
   case MachineOperand::MO_MCSymbol:
     return LowerSymbolOperand(MO, MO.getMCSymbol());
   case MachineOperand::MO_JumpTableIndex:

From c1c4d7191d7078216b9c8793e46fff84a8c7a02d Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 27 Mar 2025 01:00:02 -0700
Subject: [PATCH 279/282] [clang-format] Allow `Language: Cpp` for C files
 (#133033)

Fix #132832

(cherry picked from commit 05fb8408de23c3ccb6125b6886742177755bd757)
---
 clang/lib/Format/Format.cpp                | 18 ++++++++++++++----
 clang/unittests/Format/ConfigParseTest.cpp | 20 ++++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 0bb8545884442..768e655f65ce7 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -2114,10 +2114,14 @@ std::error_code parseConfiguration(llvm::MemoryBufferRef Config,
   FormatStyle::FormatStyleSet StyleSet;
   bool LanguageFound = false;
   for (const FormatStyle &Style : llvm::reverse(Styles)) {
-    if (Style.Language != FormatStyle::LK_None)
+    const auto Lang = Style.Language;
+    if (Lang != FormatStyle::LK_None)
       StyleSet.Add(Style);
-    if (Style.Language == Language)
+    if (Lang == Language ||
+        // For backward compatibility.
+        (Lang == FormatStyle::LK_Cpp && Language == FormatStyle::LK_C)) {
       LanguageFound = true;
+    }
   }
   if (!LanguageFound) {
     if (Styles.empty() || Styles[0].Language != FormatStyle::LK_None)
@@ -2157,8 +2161,14 @@ FormatStyle::FormatStyleSet::Get(FormatStyle::LanguageKind Language) const {
   if (!Styles)
     return std::nullopt;
   auto It = Styles->find(Language);
-  if (It == Styles->end())
-    return std::nullopt;
+  if (It == Styles->end()) {
+    if (Language != FormatStyle::LK_C)
+      return std::nullopt;
+    // For backward compatibility.
+    It = Styles->find(FormatStyle::LK_Cpp);
+    if (It == Styles->end())
+      return std::nullopt;
+  }
   FormatStyle Style = It->second;
   Style.StyleSet = *this;
   return Style;
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 10788449a1a1d..fcf07e660ddb6 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -1214,6 +1214,26 @@ TEST(ConfigParseTest, ParsesConfigurationWithLanguages) {
               IndentWidth, 56u);
 }
 
+TEST(ConfigParseTest, AllowCppForC) {
+  FormatStyle Style = {};
+  Style.Language = FormatStyle::LK_C;
+  EXPECT_EQ(parseConfiguration("Language: Cpp", &Style), ParseError::Success);
+
+  CHECK_PARSE("---\n"
+              "IndentWidth: 4\n"
+              "---\n"
+              "Language: Cpp\n"
+              "IndentWidth: 8\n",
+              IndentWidth, 8u);
+
+  EXPECT_EQ(parseConfiguration("---\n"
+                               "Language: ObjC\n"
+                               "---\n"
+                               "Language: Cpp\n",
+                               &Style),
+            ParseError::Success);
+}
+
 TEST(ConfigParseTest, UsesLanguageForBasedOnStyle) {
   FormatStyle Style = {};
   Style.Language = FormatStyle::LK_JavaScript;

From 44a6f6abbdb6f0eebfaf1ad6f601c29f80782de7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 26 Mar 2025 22:13:28 +0200
Subject: [PATCH 280/282] [libcxx] [test] Fix restoring LLVM_DIR and Clang_DIR
 (#132838)

In 664f345cd53d1f624d94f9889a1c9fff803e3391, a fix was introduced,
attempting to restore LLVM_DIR and Clang_DIR after doing
find_package(Clang).

However, 6775285e7695f2d45cf455f5d31b2c9fa9362d3d added a return if the
clangTidy target wasn't found. If this is hit, we don't restore LLVM_DIR
and Clang_DIR, which causes strange effects if CMake is rerun a second
time.

Move the code for restoring LLVM_DIR and Clang_DIR to directly after the
find_package calls, to make sure they are restored, regardless of the
find_package outcome.

(cherry picked from commit 51bceb46f8eeb7c3d060387be315ca41855933c2)
---
 libcxx/test/tools/clang_tidy_checks/CMakeLists.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 0f8f0e8864d0f..da045fac92ce4 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -8,6 +8,10 @@ set(Clang_DIR_SAVE ${Clang_DIR})
 # versions must match. Otherwise there likely will be ODR-violations. This had
 # led to crashes and incorrect output of the clang-tidy based checks.
 find_package(Clang ${CMAKE_CXX_COMPILER_VERSION})
+
+set(LLVM_DIR "${LLVM_DIR_SAVE}" CACHE PATH "The directory containing a CMake configuration file for LLVM." FORCE)
+set(Clang_DIR "${Clang_DIR_SAVE}" CACHE PATH "The directory containing a CMake configuration file for Clang." FORCE)
+
 if(NOT Clang_FOUND)
   message(STATUS "Clang-tidy tests are disabled since the "
                  "Clang development package is unavailable.")
@@ -19,9 +23,6 @@ if(NOT TARGET clangTidy)
   return()
 endif()
 
-set(LLVM_DIR "${LLVM_DIR_SAVE}" CACHE PATH "The directory containing a CMake configuration file for LLVM." FORCE)
-set(Clang_DIR "${Clang_DIR_SAVE}" CACHE PATH "The directory containing a CMake configuration file for Clang." FORCE)
-
 message(STATUS "Found system-installed LLVM ${LLVM_PACKAGE_VERSION} with headers in ${LLVM_INCLUDE_DIRS}")
 
 set(CMAKE_CXX_STANDARD 20)

From 943b43250b5580b73a4932032270e4478b118dcf Mon Sep 17 00:00:00 2001
From: R-Goc <ryszardgoc@gmail.com>
Date: Sat, 29 Mar 2025 07:27:50 +0100
Subject: [PATCH 281/282] release/20.x: [clang][docs] Move -Wnontrivial-memcall
 to added flags. (#132367)

-Wnon-trivial-memcall was incorrectly added to modified flags instead of
added flags. This commit moves it to the added compiler flags.
---
 clang/docs/ReleaseNotes.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c921ac3518f01..f4befc242f28b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -545,6 +545,11 @@ New Compiler Flags
 - The ``-Warray-compare-cxx26`` warning has been added to warn about array comparison
   starting from C++26, this warning is enabled as an error by default.
 
+- The ``-Wnontrivial-memcall`` warning has been added to warn about
+  passing non-trivially-copyable destination parameter to ``memcpy``,
+  ``memset`` and similar functions for which it is a documented undefined
+  behavior. It is implied by ``-Wnontrivial-memaccess``
+
 - clang-cl and clang-dxc now support ``-fdiagnostics-color=[auto|never|always]``
   in addition to ``-f[no-]color-diagnostics``.
 
@@ -576,11 +581,6 @@ Modified Compiler Flags
   to utilize these vector libraries. The behavior for all other vector function
   libraries remains unchanged.
 
-- The ``-Wnontrivial-memcall`` warning has been added to warn about
-  passing non-trivially-copyable destination parameter to ``memcpy``,
-  ``memset`` and similar functions for which it is a documented undefined
-  behavior. It is implied by ``-Wnontrivial-memaccess``
-
 - Added ``-fmodules-reduced-bmi`` flag corresponding to
   ``-fexperimental-modules-reduced-bmi`` flag. The ``-fmodules-reduced-bmi`` flag
   is intended to be enabled by default in the future.

From 5ba194972878531b527f7370b509829a8e251949 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 25 Mar 2025 21:17:37 -0700
Subject: [PATCH 282/282] [MC,COFF] .safeseh: avoid changeSection (#132624)

The directive temporarily switches to the .sxdata section to emit data,
and then calls `insert`, which makes `CurFrag` out of sync of the
current section. Call push/switch/pop instead.

Related to #132464

(cherry picked from commit ece72e2731350d9840c6446db9276b04d593cc23)
---
 llvm/lib/MC/MCWinCOFFStreamer.cpp | 4 +++-
 llvm/test/CodeGen/X86/win32-eh.ll | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 8fd46bc8b0255..3720d6e26fe46 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -299,7 +299,8 @@ void MCWinCOFFStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) {
     return;
 
   MCSection *SXData = getContext().getObjectFileInfo()->getSXDataSection();
-  changeSection(SXData);
+  pushSection();
+  switchSection(SXData);
   SXData->ensureMinAlignment(Align(4));
 
   insert(getContext().allocFragment<MCSymbolIdFragment>(Symbol));
@@ -310,6 +311,7 @@ void MCWinCOFFStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) {
   // function. Go ahead and oblige it here.
   CSymbol->setType(COFF::IMAGE_SYM_DTYPE_FUNCTION
                    << COFF::SCT_COMPLEX_TYPE_SHIFT);
+  popSection();
 }
 
 void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
diff --git a/llvm/test/CodeGen/X86/win32-eh.ll b/llvm/test/CodeGen/X86/win32-eh.ll
index 82dc4beaf972b..d3d19ede546d6 100644
--- a/llvm/test/CodeGen/X86/win32-eh.ll
+++ b/llvm/test/CodeGen/X86/win32-eh.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck %s
+; RUN: llc -mtriple=i686-pc-windows-msvc -filetype=obj < %s -o %t
 
 declare void @may_throw_or_crash()
 declare i32 @_except_handler3(...)
@@ -208,6 +209,14 @@ catch:
 ; CHECK-NEXT:  .long   0
 ; CHECK-NEXT:  .long   1
 
+; CHECK-LABEL: inlineasm:
+; CHECK: .safeseh my_handler
+define i32 @inlineasm() {
+entry:
+  call void asm sideeffect ".safeseh my_handler", "~{dirflag},~{fpsr},~{flags}"()
+  ret i32 0
+}
+
 ; CHECK-LABEL: ___ehhandler$use_CxxFrameHandler3:
 ; CHECK: movl $L__ehtable$use_CxxFrameHandler3, %eax
 ; CHECK-NEXT: jmp  ___CxxFrameHandler3 # TAILCALL