Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit ac604b2

Browse files
authored Jan 7, 2025
[libc] Switch to using the generic <gpuintrin.h> implementations (#121810)
Summary: This patch switches the GPU utility helpers to wrapping around the gpuintrin.h ones with a C++ flavor.
1 parent a15fedc commit ac604b2

File tree

10 files changed

+90
-479
lines changed

10 files changed

+90
-479
lines changed
 

‎libc/src/__support/GPU/CMakeLists.txt

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,12 @@
1-
if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
1+
# These utilities are GPU only.
2+
if(NOT LIBC_TARGET_OS_IS_GPU)
23
return()
34
endif()
45

5-
add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
6-
set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils)
7-
86
add_header_library(
97
utils
108
HDRS
119
utils.h
12-
DEPENDS
13-
${target_gpu_utils}
1410
)
1511

1612
add_object_library(
@@ -21,6 +17,6 @@ add_object_library(
2117
allocator.h
2218
DEPENDS
2319
libc.src.__support.common
24-
libc.src.__support.GPU.utils
2520
libc.src.__support.RPC.rpc_client
21+
.utils
2622
)

‎libc/src/__support/GPU/amdgpu/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
This file was deleted.

‎libc/src/__support/GPU/amdgpu/utils.h

Lines changed: 0 additions & 183 deletions
This file was deleted.

‎libc/src/__support/GPU/generic/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
This file was deleted.

‎libc/src/__support/GPU/generic/utils.h

Lines changed: 0 additions & 84 deletions
This file was deleted.

‎libc/src/__support/GPU/nvptx/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
This file was deleted.

‎libc/src/__support/GPU/nvptx/utils.h

Lines changed: 0 additions & 160 deletions
This file was deleted.

‎libc/src/__support/GPU/utils.h

Lines changed: 84 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -9,48 +9,108 @@
99
#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
1010
#define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
1111

12+
#include "src/__support/macros/attributes.h"
1213
#include "src/__support/macros/config.h"
1314
#include "src/__support/macros/properties/architectures.h"
1415

15-
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
16-
#include "amdgpu/utils.h"
17-
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
18-
#include "nvptx/utils.h"
19-
#else
20-
#include "generic/utils.h"
16+
#if !__has_include(<gpuintrin.h>)
17+
#error "Unsupported compiler"
2118
#endif
2219

20+
#include <gpuintrin.h>
21+
2322
namespace LIBC_NAMESPACE_DECL {
2423
namespace gpu {
25-
/// Get the first active thread inside the lane.
26-
LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
27-
return __builtin_ffsll(lane_mask) - 1;
24+
25+
/// Alias for `T` qualified with the `__gpu_private` qualifier provided by
/// <gpuintrin.h>.
template <typename T> using Private = __gpu_private T;
26+
/// Alias for `T` qualified with the `__gpu_constant` qualifier provided by
/// <gpuintrin.h>.
template <typename T> using Constant = __gpu_constant T;
27+
/// Alias for `T` qualified with the `__gpu_local` qualifier provided by
/// <gpuintrin.h>.
template <typename T> using Local = __gpu_local T;
28+
template <typename T> using Global = __gpu_local T;
29+
30+
/// Returns `__gpu_num_blocks(0)`: the number of blocks along the 'x' dimension.
LIBC_INLINE uint32_t get_num_blocks_x() {
  const uint32_t count = __gpu_num_blocks(0);
  return count;
}
31+
32+
/// Returns `__gpu_num_blocks(1)`: the number of blocks along the 'y' dimension.
LIBC_INLINE uint32_t get_num_blocks_y() {
  const uint32_t count = __gpu_num_blocks(1);
  return count;
}
33+
34+
/// Returns `__gpu_num_blocks(2)`: the number of blocks along the 'z' dimension.
LIBC_INLINE uint32_t get_num_blocks_z() {
  const uint32_t count = __gpu_num_blocks(2);
  return count;
}
35+
36+
/// Returns the total number of blocks: the product of the block counts along
/// the x, y and z dimensions.
LIBC_INLINE uint64_t get_num_blocks() {
  // Widen the first operand before multiplying. The per-dimension counts are
  // 32-bit, so without the cast the whole product would be evaluated (and
  // could wrap) in 32 bits before being converted to the 64-bit return type.
  return static_cast<uint64_t>(get_num_blocks_x()) * get_num_blocks_y() *
         get_num_blocks_z();
}
39+
40+
/// Returns `__gpu_block_id(0)`: the block index along the 'x' dimension.
LIBC_INLINE uint32_t get_block_id_x() {
  const uint32_t id = __gpu_block_id(0);
  return id;
}
41+
42+
/// Returns `__gpu_block_id(1)`: the block index along the 'y' dimension.
LIBC_INLINE uint32_t get_block_id_y() {
  const uint32_t id = __gpu_block_id(1);
  return id;
}
43+
44+
/// Returns `__gpu_block_id(2)`: the block index along the 'z' dimension.
LIBC_INLINE uint32_t get_block_id_z() {
  const uint32_t id = __gpu_block_id(2);
  return id;
}
45+
46+
/// Returns the linearized block index:
///   id_x + num_x * id_y + num_x * num_y * id_z.
LIBC_INLINE uint64_t get_block_id() {
  // Hold the per-dimension counts as 64-bit values so every product and sum
  // below is evaluated in 64 bits. With 32-bit operands the arithmetic would
  // wrap before being widened to the return type.
  const uint64_t num_x = get_num_blocks_x();
  const uint64_t num_y = get_num_blocks_y();
  return get_block_id_x() + num_x * get_block_id_y() +
         num_x * num_y * get_block_id_z();
}
50+
51+
/// Returns `__gpu_num_threads(0)`: the number of threads along the 'x'
/// dimension.
LIBC_INLINE uint32_t get_num_threads_x() {
  const uint32_t count = __gpu_num_threads(0);
  return count;
}
52+
53+
/// Returns `__gpu_num_threads(1)`: the number of threads along the 'y'
/// dimension.
LIBC_INLINE uint32_t get_num_threads_y() {
  const uint32_t count = __gpu_num_threads(1);
  return count;
}
54+
55+
/// Returns `__gpu_num_threads(2)`: the number of threads along the 'z'
/// dimension.
LIBC_INLINE uint32_t get_num_threads_z() {
  const uint32_t count = __gpu_num_threads(2);
  return count;
}
56+
57+
/// Returns the total number of threads: the product of the thread counts
/// along the x, y and z dimensions.
LIBC_INLINE uint64_t get_num_threads() {
  // Widen the first operand so the product is evaluated in 64 bits rather
  // than in 32 bits followed by a late conversion to the return type.
  return static_cast<uint64_t>(get_num_threads_x()) * get_num_threads_y() *
         get_num_threads_z();
}
60+
61+
/// Returns `__gpu_thread_id(0)`: the thread index along the 'x' dimension.
LIBC_INLINE uint32_t get_thread_id_x() {
  const uint32_t id = __gpu_thread_id(0);
  return id;
}
62+
63+
/// Returns `__gpu_thread_id(1)`: the thread index along the 'y' dimension.
LIBC_INLINE uint32_t get_thread_id_y() {
  const uint32_t id = __gpu_thread_id(1);
  return id;
}
64+
65+
/// Returns `__gpu_thread_id(2)`: the thread index along the 'z' dimension.
LIBC_INLINE uint32_t get_thread_id_z() {
  const uint32_t id = __gpu_thread_id(2);
  return id;
}
66+
67+
/// Returns the linearized thread index:
///   id_x + num_x * id_y + num_x * num_y * id_z.
LIBC_INLINE uint64_t get_thread_id() {
  // Hold the per-dimension counts as 64-bit values so the whole expression is
  // evaluated in 64 bits instead of wrapping in 32 bits before widening.
  const uint64_t num_x = get_num_threads_x();
  const uint64_t num_y = get_num_threads_y();
  return get_thread_id_x() + num_x * get_thread_id_y() +
         num_x * num_y * get_thread_id_z();
}
71+
72+
/// Returns `__gpu_num_lanes()`: the lane count reported by <gpuintrin.h>.
LIBC_INLINE uint32_t get_lane_size() {
  const uint32_t lanes = __gpu_num_lanes();
  return lanes;
}
73+
74+
/// Returns `__gpu_lane_id()`: the calling thread's lane index.
LIBC_INLINE uint32_t get_lane_id() {
  const uint32_t lane = __gpu_lane_id();
  return lane;
}
75+
76+
/// Returns `__gpu_lane_mask()` as a 64-bit mask.
/// NOTE(review): presumably one bit per currently active lane; confirm
/// against <gpuintrin.h>.
LIBC_INLINE uint64_t get_lane_mask() {
  const uint64_t mask = __gpu_lane_mask();
  return mask;
}
77+
78+
/// Forwards to `__gpu_read_first_lane_u32(lane_mask, x)` and returns its
/// result.
/// NOTE(review): presumably the value of `x` held by the first lane in
/// `lane_mask`; confirm against <gpuintrin.h>.
LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
  const uint32_t first_lane_value = __gpu_read_first_lane_u32(lane_mask, x);
  return first_lane_value;
}
81+
82+
/// Forwards to `__gpu_ballot(lane_mask, x)` and returns the resulting 64-bit
/// mask.
/// NOTE(review): presumably one bit per lane whose `x` is true; confirm
/// against <gpuintrin.h>.
LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
  const uint64_t votes = __gpu_ballot(lane_mask, x);
  return votes;
}
85+
86+
/// Invokes `__gpu_sync_threads()`.
/// NOTE(review): presumably a barrier across the threads of the current
/// block; confirm against <gpuintrin.h>.
LIBC_INLINE void sync_threads() {
  __gpu_sync_threads();
}
87+
88+
/// Invokes `__gpu_sync_lane(lane_mask)`.
/// NOTE(review): presumably synchronizes the lanes selected by `lane_mask`;
/// confirm against <gpuintrin.h>.
LIBC_INLINE void sync_lane(uint64_t lane_mask) {
  __gpu_sync_lane(lane_mask);
}
89+
90+
/// Forwards to `__gpu_shuffle_idx_u32(lane_mask, idx, x)` and returns its
/// result.
/// NOTE(review): presumably the value of `x` held by lane `idx`; confirm
/// against <gpuintrin.h>.
LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
  const uint32_t shuffled = __gpu_shuffle_idx_u32(lane_mask, idx, x);
  return shuffled;
}
2993

30-
/// Conditional that is only true for a single thread in a lane.
94+
/// Terminates execution by calling `__gpu_exit()`; declared `[[noreturn]]`.
[[noreturn]] LIBC_INLINE void end_program() {
  __gpu_exit();
}
95+
3196
/// Conditional that is only true for a single thread in a lane. The new
/// implementation delegates to `__gpu_is_first_in_lane(lane_mask)`; the
/// removed one compared the lane id against the first set bit of `lane_mask`.
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
32-
return gpu::get_lane_id() == get_first_lane_id(lane_mask);
97+
return __gpu_is_first_in_lane(lane_mask);
3398
}
3499

35-
/// Gets the sum of all lanes inside the warp or wavefront.
36100
/// Gets the sum of all lanes inside the warp or wavefront. The new
/// implementation delegates to `__gpu_lane_sum_u32(lane_mask, x)`; the removed
/// one accumulated via a halving-step shuffle loop followed by a broadcast.
LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
37-
for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
38-
uint32_t index = step + gpu::get_lane_id();
39-
x += gpu::shuffle(lane_mask, index, x);
40-
}
41-
return gpu::broadcast_value(lane_mask, x);
101+
return __gpu_lane_sum_u32(lane_mask, x);
42102
}
43103

44-
/// Gets the accumulator scan of the threads in the warp or wavefront.
45104
/// Gets the accumulator scan of the threads in the warp or wavefront. The new
/// implementation delegates to `__gpu_lane_scan_u32(lane_mask, x)`; the
/// removed one accumulated via a doubling-step shuffle loop, masking out
/// contributions for lanes whose id is below the current step.
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
46-
for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
47-
uint32_t index = gpu::get_lane_id() - step;
48-
uint32_t bitmask = gpu::get_lane_id() >= step;
49-
x += -bitmask & gpu::shuffle(lane_mask, index, x);
50-
}
51-
return x;
105+
return __gpu_lane_scan_u32(lane_mask, x);
106+
}
107+
108+
/// Returns the current value of the counter read by
/// `__builtin_readsteadycounter()`.
LIBC_INLINE uint64_t fixed_frequency_clock() {
  const uint64_t ticks = __builtin_readsteadycounter();
  return ticks;
}
53111

112+
/// Returns the current value of the counter read by
/// `__builtin_readcyclecounter()`.
LIBC_INLINE uint64_t processor_clock() {
  const uint64_t cycles = __builtin_readcyclecounter();
  return cycles;
}
113+
54114
} // namespace gpu
55115
} // namespace LIBC_NAMESPACE_DECL
56116

‎libc/src/time/gpu/clock.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,8 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "src/time/clock.h"
10+
11+
#include "src/__support/common.h"
1012
#include "src/__support/macros/config.h"
1113
#include "src/__support/time/gpu/time_utils.h"
1214

‎libc/src/time/gpu/nanosleep.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88

99
#include "src/time/nanosleep.h"
1010

11+
#include "src/__support/common.h"
1112
#include "src/__support/macros/config.h"
1213
#include "src/__support/time/gpu/time_utils.h"
1314

0 commit comments

Comments
 (0)
Please sign in to comment.