[libc][nfc] Use common implementation of read_first_lane_u64 #131027


Merged

Conversation

JonChesterfield
Collaborator

No codegen regression on either target. The two __builtin_ffs computations implied on nvptx CSE away.

```
define internal i64 @__gpu_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv2 = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv2, i1 true)
  %iszero = icmp eq i32 %conv2, 0
  %sub = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv, i32 %sub, i32 31)
  %conv4 = sext i32 %1 to i64
  %shl = shl nsw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv1, i32 %sub, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
; becomes

define internal i64 @__gpu_competing_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv.i = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv.i, i1 true)
  %iszero = icmp eq i32 %conv.i, 0
  %sub.i = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv, i32 %sub.i, i32 31)
  %conv4 = zext i32 %1 to i64
  %shl = shl nuw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv1, i32 %sub.i, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
```
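
The nvptx IR above comes from routing both 64-bit halves through the existing 32-bit broadcast. Reconstructed from the context lines in the diff below (a sketch, not copied verbatim), that helper looks like:

```c
// nvptxintrin.h's 32-bit broadcast, reconstructed from the diff context.
// __builtin_ffs returns the 1-based index of the first set bit (0 if none),
// so __id is the first active lane, or -1 for an empty mask; the shuffle
// then reads that lane's value across the warp.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
  uint32_t __mask = (uint32_t)__lane_mask;
  uint32_t __id = __builtin_ffs(__mask) - 1;
  return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
}
```

Calling this twice from the 64-bit wrapper implies two __builtin_ffs computations of the same lane id, which is the CSE referred to above: only one cttz/select pair survives in the optimized IR.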

The sext vs zext difference is vaguely interesting, but since the extended bits are immediately discarded by the shift left of 32 in either case, it makes no odds. The amdgcn version doesn't need CSE; its readfirstlane function is a single call to an intrinsic.

Drive-by fix to __gpu_match_all_u32: it was calling first_lane_u64 and can use first_lane_u32 instead. Also added the missing __gpu_read_first_lane_u64 call to the gpuintrin.c test case, and a stray missing static.
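
For reference, the common implementation this patch moves into gpuintrin.h (shown in full in the diff below) builds the 64-bit broadcast out of two 32-bit ones:

```c
// Copies the value from the first active thread in the wavefront to the
// rest, one 32-bit half at a time via the target's u32 broadcast.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFFull);
}
```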

@JonChesterfield JonChesterfield requested a review from jhuber6 March 12, 2025 20:52
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics labels Mar 12, 2025
@llvmbot
Member

llvmbot commented Mar 12, 2025

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: Jon Chesterfield (JonChesterfield)

Full diff: https://github.com/llvm/llvm-project/pull/131027.diff

4 Files Affected:

  • (modified) clang/lib/Headers/amdgpuintrin.h (+5-10)
  • (modified) clang/lib/Headers/gpuintrin.h (+9)
  • (modified) clang/lib/Headers/nvptxintrin.h (+5-16)
  • (modified) clang/test/Headers/gpuintrin.c (+79-8)
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 15409eacf7716..839a05175cf3e 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_FN_ATTRS __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
-         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
-}
-
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                           bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
   __gpu_sync_lane(__lane_mask);
   return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index efdc3d94ac0b3..b7b997c1968c5 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,6 +115,15 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
+  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
+         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFFull);
+}
+
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 73eb0af8b5926..d00a5f6de3950 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -37,6 +37,10 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -120,21 +124,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
   return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
 }
 
-// Copies the value from the first active thread in the warp to the rest.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  uint32_t __mask = (uint32_t)__lane_mask;
-  uint32_t __id = __builtin_ffs(__mask) - 1;
-  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
-                                             __gpu_num_lanes() - 1)
-          << 32ull) |
-         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
-                                             __gpu_num_lanes() - 1) &
-          0xFFFFFFFF);
-}
-
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                           bool __x) {
@@ -231,7 +220,7 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
   return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
 #endif
 
-  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
   return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
 }
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 30aa6f147ba03..9a15ce277ba87 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
   __gpu_lane_id();
   __gpu_lane_mask();
   __gpu_read_first_lane_u32(-1, -1);
+  __gpu_read_first_lane_u64(-1, -1);
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
 // AMDGPU-NEXT:    unreachable
 //
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:    ret i32 [[TMP1]]
 //
 //
+// AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
+// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
+// AMDGPU-NEXT:    [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
+// AMDGPU-NEXT:    [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
+// AMDGPU-NEXT:    [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
+// AMDGPU-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// AMDGPU-NEXT:    [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// AMDGPU-NEXT:    store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// AMDGPU-NEXT:    [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// AMDGPU-NEXT:    store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// AMDGPU-NEXT:    [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// AMDGPU-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
+// AMDGPU-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// AMDGPU-NEXT:    [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// AMDGPU-NEXT:    [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// AMDGPU-NEXT:    ret i64 [[OR]]
+//
+//
 // AMDGPU-LABEL: define internal i64 @__gpu_ballot(
 // AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
 // NVPTX-NEXT:    unreachable
 //
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    ret i32 [[TMP7]]
 //
 //
+// NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
+// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__X_ADDR:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__HI:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__LO:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT:    store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// NVPTX-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// NVPTX-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// NVPTX-NEXT:    [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// NVPTX-NEXT:    store i32 [[CONV]], ptr [[__HI]], align 4
+// NVPTX-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// NVPTX-NEXT:    [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// NVPTX-NEXT:    [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// NVPTX-NEXT:    store i32 [[CONV1]], ptr [[__LO]], align 4
+// NVPTX-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
+// NVPTX-NEXT:    [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// NVPTX-NEXT:    [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// NVPTX-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// NVPTX-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
+// NVPTX-NEXT:    [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// NVPTX-NEXT:    [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// NVPTX-NEXT:    [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// NVPTX-NEXT:    ret i64 [[OR]]
+//
+//
 // NVPTX-LABEL: define internal i64 @__gpu_ballot(
 // NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
 // NVPTX-NEXT:  [[ENTRY:.*:]]


github-actions bot commented Mar 12, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Contributor

@jhuber6 jhuber6 left a comment


This means that nvptxintrin.h and amdgpuintrin.h can't be included standalone, but I'm not sure it's a big deal since we already define a lot of the common functionality in the gpuintrin.h header.
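
Concretely, the patch makes each target header forward-declare the helper that gpuintrin.h later defines; from amdgpuintrin.h in the diff above:

```c
// Defined in gpuintrin.h, used later in this file. Including
// amdgpuintrin.h on its own would leave this declaration unsatisfied,
// which is the standalone-include limitation noted in the comment above.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
```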

…gen regression
@JonChesterfield JonChesterfield force-pushed the jc_common_read_first_lane_u64 branch from a343ee9 to 68f09d0 on March 12, 2025 21:01
@JonChesterfield JonChesterfield merged commit cba9dc6 into llvm:main Mar 12, 2025
9 of 10 checks passed
@JonChesterfield JonChesterfield deleted the jc_common_read_first_lane_u64 branch March 12, 2025 21:29
@llvm-ci
Collaborator

llvm-ci commented Mar 12, 2025

LLVM Buildbot has detected a new failure on builder openmp-s390x-linux running on systemz-1 while building clang at step 2 "checkout".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/9054

Here is the relevant piece of the build log for reference:
Step 2 (checkout) failure: update (failure)
git version 2.34.1
fatal: unable to access 'https://github.com/llvm/llvm-project.git/': Failed to connect to github.com port 443 after 133598 ms: Connection timed out
fatal: unable to access 'https://github.com/llvm/llvm-project.git/': GnuTLS recv error (-110): The TLS connection was non-properly terminated.

frederik-h pushed a commit to frederik-h/llvm-project that referenced this pull request Mar 18, 2025