 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H

+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"

-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/utils.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/utils.h"
-#else
-#include "generic/utils.h"
+#if !__has_include(<gpuintrin.h>)
+#error "Unsupported compiler"
 #endif

+#include <gpuintrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 namespace gpu {
-/// Get the first active thread inside the lane.
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsll(lane_mask) - 1;
+
+template <typename T> using Private = __gpu_private T;
+template <typename T> using Constant = __gpu_constant T;
+template <typename T> using Local = __gpu_local T;
+template <typename T> using Global = __gpu_global T;
+
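
The per-target amdgpu/, nvptx/, and generic/ headers are gone; any compiler that ships <gpuintrin.h> now gets one portable implementation, with the address-space qualifiers exposed as template aliases over the __gpu_* attribute macros. A minimal sketch (mine, not part of the patch) of how an alias reads at a use site; `scratch` is a hypothetical file-scope variable in device code:

    // Workgroup-shared scratch storage: Local<T> expands to __gpu_local T.
    gpu::Local<uint32_t> scratch[64];
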
+LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
+
+LIBC_INLINE uint64_t get_num_blocks() {
+  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
+LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
+
+LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
+
+LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
+
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
+
+LIBC_INLINE uint64_t get_num_threads() {
+  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
+LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
+
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
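
Block and thread indices flatten the same way, x fastest, so a grid-wide unique index composes directly from the two flattened helpers. A minimal sketch (mine, not part of the patch; `global_thread_id` is a hypothetical name):

    // Threads in all blocks that precede this one, plus this thread's
    // offset inside its own block.
    LIBC_INLINE uint64_t global_thread_id() {
      return gpu::get_num_threads() * gpu::get_block_id() + gpu::get_thread_id();
    }
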
+LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
+
+LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
+
+LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
+
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+  return __gpu_read_first_lane_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  return __gpu_ballot(lane_mask, x);
+}
+
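
ballot() returns one bit per lane of the mask whose predicate was true, so counting votes is a single popcount. A hedged usage sketch, assuming it runs in convergent device code (`pred` is a placeholder predicate):

    uint64_t mask = gpu::get_lane_mask();
    uint64_t votes = gpu::ballot(mask, pred);      // bit i set if lane i voted true
    uint32_t count = __builtin_popcountll(votes);  // number of lanes that voted true
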
+LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
+
+LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
+
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
 }

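shuffle() lets each lane read x from whichever lane idx names, which makes intra-warp permutations a single operation. A small sketch (mine), assuming every lane in the warp or wavefront is active (`v` is a placeholder per-lane value):

    uint64_t mask = gpu::get_lane_mask();
    // Each lane fetches the value held by its mirror lane, reversing the group.
    uint32_t mirror = gpu::get_lane_size() - 1 - gpu::get_lane_id();
    uint32_t reversed = gpu::shuffle(mask, mirror, v);
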
-/// Conditional that is only true for a single thread in a lane.
+[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
+
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
-  return gpu::get_lane_id() == get_first_lane_id(lane_mask);
+  return __gpu_is_first_in_lane(lane_mask);
 }

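A common pattern pairs is_first_lane() with broadcast_value(): one lane performs the divergent work, then the whole group reads its result. A sketch under that assumption (`compute_once` is a hypothetical function returning a value all lanes need):

    uint64_t mask = gpu::get_lane_mask();
    uint32_t result = 0;
    if (gpu::is_first_lane(mask))
      result = compute_once();                    // only the leader executes this
    result = gpu::broadcast_value(mask, result);  // every lane now holds the value
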
-/// Gets the sum of all lanes inside the warp or wavefront.
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
-    uint32_t index = step + gpu::get_lane_id();
-    x += gpu::shuffle(lane_mask, index, x);
-  }
-  return gpu::broadcast_value(lane_mask, x);
+  return __gpu_lane_sum_u32(lane_mask, x);
 }

-/// Gets the accumulator scan of the threads in the warp or wavefront.
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
-    uint32_t index = gpu::get_lane_id() - step;
-    uint32_t bitmask = gpu::get_lane_id() >= step;
-    x += -bitmask & gpu::shuffle(lane_mask, index, x);
-  }
-  return x;
+  return __gpu_lane_scan_u32(lane_mask, x);
+}
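
The hand-rolled butterfly reduction and accumulator scan collapse into single builtins with the same intent: reduce() hands every lane the sum across the mask, and scan() gives each lane the running sum through its own position, matching the old inclusive loop. A usage sketch (`x` stands in for any per-lane value):

    uint64_t mask = gpu::get_lane_mask();
    uint32_t total = gpu::reduce(mask, x);   // identical sum in every lane
    uint32_t prefix = gpu::scan(mask, x);    // sum over lanes 0..lane_id, inclusive
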
+
+LIBC_INLINE uint64_t fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
 }

+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL

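The two counters answer different questions: __builtin_readsteadycounter() ticks at a fixed frequency and suits wall-clock style timing, while __builtin_readcyclecounter() counts processor cycles and shifts with the clock rate. A timing sketch (mine, not from the patch; `do_work` is a placeholder):

    uint64_t start = gpu::fixed_frequency_clock();
    do_work();
    uint64_t elapsed = gpu::fixed_frequency_clock() - start;  // steady ticks spent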