[libc] Improve memcpy for ARM Cortex-M supporting unaligned accesses. #144872
Conversation
@llvm/pr-subscribers-libc

Author: Guillaume Chatelet (gchatelet)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/144872.diff

6 Files Affected:
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index 253843e5e37aa..f7133c94b405d 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -10,7 +10,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
#define LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG
@@ -30,8 +30,10 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#if defined(LIBC_COMPILER_IS_CLANG)
#define LIBC_LOOP_NOUNROLL _Pragma("nounroll")
+#define LIBC_LOOP_UNROLL _Pragma("unroll")
#elif defined(LIBC_COMPILER_IS_GCC)
#define LIBC_LOOP_NOUNROLL _Pragma("GCC unroll 0")
+#define LIBC_LOOP_UNROLL _Pragma("GCC unroll 2048")
#else
#error "Unhandled compiler"
#endif
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 08c0b0d34d503..a967247db53f4 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
aarch64/inline_memcpy.h
aarch64/inline_memmove.h
aarch64/inline_memset.h
+ arm/inline_memcpy.h
generic/aligned_access.h
generic/byte_per_byte.h
inline_bcmp.h
diff --git a/libc/src/string/memory_utils/arm/inline_memcpy.h b/libc/src/string/memory_utils/arm/inline_memcpy.h
new file mode 100644
index 0000000000000..be8bc6459b6c4
--- /dev/null
+++ b/libc/src/string/memory_utils/arm/inline_memcpy.h
@@ -0,0 +1,127 @@
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
+
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
+#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align
+
+#include <stddef.h> // size_t
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
+
+template <size_t bytes>
+LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
+ if constexpr (bytes == 1 || bytes == 2 || bytes == 4) {
+ memcpy_inline<bytes>(dst, src);
+ } else {
+ // We restrict loads/stores to 4 byte to prevent the use of load/store
+ // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
+ // fault (see notes below) and second, they use more registers which in turn
+ // adds push/pop instructions in the hot path.
+ static_assert(bytes % kWordSize == 0);
+ LIBC_LOOP_UNROLL
+ for (size_t i = 0; i < bytes / kWordSize; ++i) {
+ const uintptr_t offset = i * kWordSize;
+ memcpy_inline<kWordSize>(dst + offset, src + offset);
+ }
+ }
+ // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
+ // into the load/store instructions.
+ // e.g.,
+ // ldrb r3, [r1], #1
+ // strb r3, [r0], #1
+ dst += bytes;
+ src += bytes;
+}
+
+template <size_t block_size>
+LIBC_INLINE void copy_blocks(Ptr &dst, CPtr &src, size_t &size) {
+ LIBC_LOOP_NOUNROLL
+ for (size_t i = 0; i < size / block_size; ++i)
+ copy_and_bump_pointers<block_size>(dst, src);
+ // Update `size` once at the end instead of once per iteration.
+ size %= block_size;
+}
+
+LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
+ return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
+ cpp::bit_cast<uintptr_t>(b));
+}
+
+LIBC_INLINE auto misaligned(CPtr a) {
+ return distance_to_align_down<kWordSize>(a);
+}
+
+} // namespace
+
+// Implementation for Cortex-M0, M0+, M1.
+// The implementation makes sure that all accesses are aligned.
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_low_end(Ptr dst, CPtr src,
+ size_t size) {
+ // For now, dummy implementation that performs byte per byte copy.
+ LIBC_LOOP_NOUNROLL
+ for (size_t i = 0; i < size; ++i)
+ dst[i] = src[i];
+}
+
+// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
+// support for unaligned loads and stores.
+// Notes:
+// - It compiles down to <300 bytes.
+// - `dst` and `src` are not `__restrict` to prevent the compiler from
+// reordering loads/stores.
+// - We keep state variables to a strict minimum to keep everything in the free
+// registers and prevent costly push / pop.
+// - If unaligned single loads/stores to normal memory are supported, unaligned
+// accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
+// STRD) instructions are generally not supported and will still fault so we
+// make sure to restrict unrolling to word loads/stores.
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
+ size_t size) {
+ if (misaligned(bitwise_or(src, dst))) [[unlikely]] {
+ if (size < 8) [[unlikely]] {
+ if (size & 1)
+ copy_and_bump_pointers<1>(dst, src);
+ if (size & 2)
+ copy_and_bump_pointers<2>(dst, src);
+ if (size & 4)
+ copy_and_bump_pointers<4>(dst, src);
+ return;
+ }
+ if (misaligned(src)) [[unlikely]] {
+ const size_t offset = distance_to_align_up<kWordSize>(dst);
+ if (offset & 1)
+ copy_and_bump_pointers<1>(dst, src);
+ if (offset & 2)
+ copy_and_bump_pointers<2>(dst, src);
+ size -= offset;
+ }
+ }
+ copy_blocks<64>(dst, src, size);
+ copy_blocks<16>(dst, src, size);
+ copy_blocks<4>(dst, src, size);
+ if (size & 1)
+ copy_and_bump_pointers<1>(dst, src);
+ if (size & 2) [[unlikely]]
+ copy_and_bump_pointers<2>(dst, src);
+}
+
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
+ const void *__restrict src_,
+ size_t size) {
+ Ptr dst = cpp::bit_cast<Ptr>(dst_);
+ CPtr src = cpp::bit_cast<CPtr>(src_);
+#ifdef __ARM_FEATURE_UNALIGNED
+ return inline_memcpy_arm_mid_end(dst, src, size);
+#else
+ return inline_memcpy_arm_low_end(dst, src, size);
+#endif
+}
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
diff --git a/libc/src/string/memory_utils/inline_memcpy.h b/libc/src/string/memory_utils/inline_memcpy.h
index f98e55321a9b5..13975e6b3bd0e 100644
--- a/libc/src/string/memory_utils/inline_memcpy.h
+++ b/libc/src/string/memory_utils/inline_memcpy.h
@@ -22,6 +22,9 @@
#include "src/string/memory_utils/x86_64/inline_memcpy.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY \
inline_memcpy_x86_maybe_interpose_repmovsb
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
+#include "src/string/memory_utils/arm/inline_memcpy.h"
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_arm
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_memcpy.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_aarch64
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index bdf0b8652188b..c08608c87bb25 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -101,7 +101,7 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst,
}
using Ptr = cpp::byte *; // Pointer to raw data.
-using CPtr = const cpp::byte *; // Const pointer to raw data.
+using CPtr = const cpp::byte *; // Pointer to const raw data.
// This type makes sure that we don't accidentally promote an integral type to
// another one. It is only constructible from the exact T type.
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 8e629270c89d2..9e2828eaf098c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -4218,6 +4218,7 @@ libc_support_library(
"src/string/memory_utils/aarch64/inline_memcpy.h",
"src/string/memory_utils/aarch64/inline_memmove.h",
"src/string/memory_utils/aarch64/inline_memset.h",
+ "src/string/memory_utils/arm/inline_memcpy.h",
"src/string/memory_utils/generic/aligned_access.h",
"src/string/memory_utils/generic/byte_per_byte.h",
"src/string/memory_utils/inline_bcmp.h",
Overall LGTM
// make sure to restrict unrolling to word loads/stores.
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                            size_t size) {
  if (misaligned(bitwise_or(src, dst))) [[unlikely]] {
does LIBC_UNLIKELY work instead of [[unlikely]]?
So I tried LIBC_UNLIKELY but the codegen is worse, and since the [[likely]] and [[unlikely]] attributes are now standard, we may want to use them instead of our own version. WDYT @lntue?

Regardless, for this patch I would like to keep the attributes.
I'm happy to move to the standardized versions, but since they're C++20 I'd want to make sure they're supported by the compilers we use.
https://libc.llvm.org/compiler_support.html
- Clang 11
- GCC 12.2
Unfortunately it is not supported by Clang 11 (but it is from Clang 12 onward).
Since it is only one version away from our supported compiler, I suggest stubbing out the attribute for Clang 11 with code like this (godbolt):
#define LIBC_HAS_ATTR_LIKELY
#if defined(LIBC_COMPILER_IS_CLANG)
#if LIBC_COMPILER_CLANG_VER < 1200
#undef LIBC_HAS_ATTR_LIKELY
#endif
#endif
#ifdef LIBC_HAS_ATTR_LIKELY
#define LIBC_ATTR_LIKELY [[likely]]
#define LIBC_ATTR_UNLIKELY [[unlikely]]
#else
#define LIBC_ATTR_LIKELY
#define LIBC_ATTR_UNLIKELY
#endif
WDYT?
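For illustration, here is a minimal sketch (not part of the patch) of how the stubbed macro could be applied at a call site, assuming the LIBC_ATTR_UNLIKELY definition above:

// The attribute form annotates the statement that follows the condition,
// so on Clang 11 the macro expands to nothing and the code still compiles.
if (misaligned(bitwise_or(src, dst)))
  LIBC_ATTR_UNLIKELY {
    // rare, misaligned path
  }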
If we really cared about older compilers we could use __builtin_expect, which should work everywhere, but I'm not sure if it's worth it.
__builtin_expect is what is behind our current LIBC_LIKELY macro; it works only on expressions. AFAICT __builtin_expect currently yields worse code than [[likely]] / [[unlikely]]. I would be in favor of embracing the new world, bump our compiler requirements to Clang 12.0.0, remove our LIBC_LIKELY / LIBC_UNLIKELY and replace them with [[likely]] / [[unlikely]]. @michaelrj-google @petrhosek @enh-google @lntue @frobtech do you have an opinion on the matter?
FTR, release dates:
- LLVM 11 : Oct 12, 2020
- LLVM 12 : Apr 15, 2021
FWIW Google OSS currently defines this C++ support matrix that mandates Clang >= 14.0.0 and GCC >= 7.5.0.
As for this PR I will use the LIBC_ATTR_LIKELY approach described above. I will keep it local to the ARM implementation for now.
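For reference, a small illustrative sketch of the two hinting styles being compared (not from the patch; exact macro definitions elided):

// Expression form: the hint wraps the condition itself, as LIBC_UNLIKELY does today.
if (LIBC_UNLIKELY(size < 8)) {
  // rare path
}

// Statement-attribute form (C++20): the hint annotates the branch instead.
if (size < 8) [[unlikely]] {
  // rare path
}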
> AFAICT __builtin_expect currently yields worse code than [[likely]] / [[unlikely]]. I would be in favor of embracing the new world, bump our compiler requirements to Clang 12.0.0, remove our LIBC_LIKELY / LIBC_UNLIKELY and replace them with [[likely]] / [[unlikely]]. @michaelrj-google @petrhosek @enh-google @lntue @frobtech do you have an opinion on the matter?
Android doesn't care about old versions of clang, no.

That said, as a compiler user --- can we fix "__builtin_expect currently yields worse code than [[likely]] / [[unlikely]]"?
I'd say having this check is good for now. I think the main blocker for bumping the compiler version has been the RISC-V 64 buildbot, and we may be moving that to emulation soon. CC @amykhuang
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/196/builds/9451

Here is the relevant piece of the build log for the reference.

The failure is not related to my change.
This implementation has been compiled with the pigweed toolchain and tested on:

- --target=armv8m.main-none-eabi -march=armv8m.main+fp+dsp -mcpu=cortex-m33
- --target=armv6m-none-eabi -march=armv6m -mcpu=cortex-m0+

They both compile down to a little bit more than 200 bytes and are between 2 and 10 times faster than byte per byte copies.
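As a side note, one way to see which of the two code paths a given configuration selects is to probe the __ARM_FEATURE_UNALIGNED macro that the dispatch in inline_memcpy_arm relies on (hypothetical probe file, not part of the patch; compile it with one of the flag sets above):

// check_unaligned_dispatch.cpp -- hypothetical probe, not part of the patch.
#ifdef __ARM_FEATURE_UNALIGNED
#pragma message("dispatch: inline_memcpy_arm_mid_end (unaligned accesses supported)")
#else
#pragma message("dispatch: inline_memcpy_arm_low_end (aligned accesses only)")
#endif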
For best performance the following options can be set in the
libc/config/baremetal/arm/config.json