diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 509b396f211bb..62dff93d85cb1 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -499,59 +499,7 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
-        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
-        // Haswell E processors. LLVM is more able to optimize if we give a struct a
-        // #[repr(simd)], even if we don't actually use this struct directly.
-        //
-        // FIXME repr(simd) broken on emscripten and redox
-        #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
-        struct Block(u64, u64, u64, u64);
-        struct UnalignedBlock(u64, u64, u64, u64);
-
-        let block_size = size_of::<Block>();
-
-        // Get raw pointers to the bytes of x & y for easier manipulation
-        let x = x as *mut T as *mut u8;
-        let y = y as *mut T as *mut u8;
-
-        // Loop through x & y, copying them `Block` at a time
-        // The optimizer should unroll the loop fully for most types
-        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
-        let len = size_of::<T>();
-        let mut i = 0;
-        while i + block_size <= len {
-            // Create some uninitialized memory as scratch space
-            // Declaring `t` here avoids aligning the stack when this loop is unused
-            let mut t: Block = uninitialized();
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            // Swap a block of bytes of x & y, using t as a temporary buffer
-            // This should be optimized into efficient SIMD operations where available
-            ptr::copy_nonoverlapping(x, t, block_size);
-            ptr::copy_nonoverlapping(y, x, block_size);
-            ptr::copy_nonoverlapping(t, y, block_size);
-            i += block_size;
-        }
-
-
-        if i < len {
-            // Swap any remaining bytes, using aligned types to copy
-            // where appropriate (this information is lost by conversion
-            // to *mut u8, so restore it manually here)
-            let mut t: UnalignedBlock = uninitialized();
-            let rem = len - i;
-
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            ptr::copy_nonoverlapping(x, t, rem);
-            ptr::copy_nonoverlapping(y, x, rem);
-            ptr::copy_nonoverlapping(t, y, rem);
-        }
+        ptr::swap_nonoverlapping(x, y, 1);
     }
 }
 
diff --git a/src/libcore/ptr.rs b/src/libcore/ptr.rs
index f89f86e18a149..bd311743bebfd 100644
--- a/src/libcore/ptr.rs
+++ b/src/libcore/ptr.rs
@@ -117,6 +117,90 @@ pub unsafe fn swap<T>(x: *mut T, y: *mut T) {
     mem::forget(tmp);
 }
 
+/// Swaps a sequence of values at two mutable locations of the same type.
+///
+/// # Safety
+///
+/// The two arguments must each point to the beginning of `count` locations
+/// of valid memory, and the two memory ranges must not overlap.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// #![feature(swap_nonoverlapping)]
+///
+/// use std::ptr;
+///
+/// let mut x = [1, 2, 3, 4];
+/// let mut y = [7, 8, 9];
+///
+/// unsafe {
+///     ptr::swap_nonoverlapping(x.as_mut_ptr(), y.as_mut_ptr(), 2);
+/// }
+///
+/// assert_eq!(x, [7, 8, 3, 4]);
+/// assert_eq!(y, [1, 2, 9]);
+/// ```
+#[inline]
+#[unstable(feature = "swap_nonoverlapping", issue = "42818")]
+pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x as *mut u8;
+    let y = y as *mut u8;
+    let len = mem::size_of::<T>() * count;
+    swap_nonoverlapping_bytes(x, y, len)
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
+    // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+    // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+    // Haswell E processors. LLVM is more able to optimize if we give a struct a
+    // #[repr(simd)], even if we don't actually use this struct directly.
+    //
+    // FIXME repr(simd) broken on emscripten and redox
+    #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
+    struct Block(u64, u64, u64, u64);
+    struct UnalignedBlock(u64, u64, u64, u64);
+
+    let block_size = mem::size_of::<Block>();
+
+    // Loop through x & y, copying them `Block` at a time
+    // The optimizer should unroll the loop fully for most types
+    // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
+    let mut i = 0;
+    while i + block_size <= len {
+        // Create some uninitialized memory as scratch space
+        // Declaring `t` here avoids aligning the stack when this loop is unused
+        let mut t: Block = mem::uninitialized();
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        // Swap a block of bytes of x & y, using t as a temporary buffer
+        // This should be optimized into efficient SIMD operations where available
+        copy_nonoverlapping(x, t, block_size);
+        copy_nonoverlapping(y, x, block_size);
+        copy_nonoverlapping(t, y, block_size);
+        i += block_size;
+    }
+
+    if i < len {
+        // Swap any remaining bytes
+        let mut t: UnalignedBlock = mem::uninitialized();
+        let rem = len - i;
+
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        copy_nonoverlapping(x, t, rem);
+        copy_nonoverlapping(y, x, rem);
+        copy_nonoverlapping(t, y, rem);
+    }
+}
+
 /// Replaces the value at `dest` with `src`, returning the old
 /// value, without dropping either.
 ///
diff --git a/src/libcore/slice/rotate.rs b/src/libcore/slice/rotate.rs
index 3b9ae5652c5d1..e4a4e33c1729e 100644
--- a/src/libcore/slice/rotate.rs
+++ b/src/libcore/slice/rotate.rs
@@ -76,7 +76,7 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
             break;
         }
 
-        ptr_swap_n(
+        ptr::swap_nonoverlapping(
            mid.offset(-(left as isize)),
            mid.offset((right-delta) as isize),
            delta);
@@ -103,10 +103,3 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
         ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);
     }
 }
-
-unsafe fn ptr_swap_n<T>(a: *mut T, b: *mut T, n: usize) {
-    for i in 0..n {
-        // These are nonoverlapping, so use mem::swap instead of ptr::swap
-        mem::swap(&mut *a.offset(i as isize), &mut *b.offset(i as isize));
-    }
-}
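For context, here is a minimal standalone sketch of the block-swapping technique that `swap_nonoverlapping_bytes` implements, written against stable Rust rather than libcore internals: it swaps `len` bytes through a small scratch buffer one block at a time and then handles the tail. The function name `swap_bytes_blockwise`, the fixed 32-byte `BLOCK` constant, and the zero-initialized scratch array (standing in for the `#[repr(simd)]` block types and `mem::uninitialized()` used by the patch) are illustrative assumptions, not part of the patch.

```
use std::ptr;

// Swap `len` bytes between `x` and `y`, one block at a time, through a
// small scratch buffer. Simplified stand-in for `swap_nonoverlapping_bytes`:
// no `#[repr(simd)]` block type, and the scratch space is zeroed rather than
// left uninitialized.
unsafe fn swap_bytes_blockwise(x: *mut u8, y: *mut u8, len: usize) {
    const BLOCK: usize = 32; // stand-in for size_of::<Block>()
    let mut i = 0;

    while i + BLOCK <= len {
        // Scratch space for one block
        let mut t = [0u8; BLOCK];
        let t = t.as_mut_ptr();
        let x = x.offset(i as isize);
        let y = y.offset(i as isize);

        // Three copies through the scratch buffer swap one block of x & y
        ptr::copy_nonoverlapping(x, t, BLOCK);
        ptr::copy_nonoverlapping(y, x, BLOCK);
        ptr::copy_nonoverlapping(t, y, BLOCK);
        i += BLOCK;
    }

    if i < len {
        // Swap whatever is left over, exactly as above but with `rem` bytes
        let rem = len - i;
        let mut t = [0u8; BLOCK];
        let t = t.as_mut_ptr();
        let x = x.offset(i as isize);
        let y = y.offset(i as isize);

        ptr::copy_nonoverlapping(x, t, rem);
        ptr::copy_nonoverlapping(y, x, rem);
        ptr::copy_nonoverlapping(t, y, rem);
    }
}

fn main() {
    // 40 bytes: one full 32-byte block plus an 8-byte tail
    let mut a = [1u8; 40];
    let mut b = [2u8; 40];
    unsafe { swap_bytes_blockwise(a.as_mut_ptr(), b.as_mut_ptr(), 40) };
    assert_eq!(&a[..], &[2u8; 40][..]);
    assert_eq!(&b[..], &[1u8; 40][..]);
}
```

The three `copy_nonoverlapping` calls per block (x -> t, y -> x, t -> y) are the pattern the patch relies on LLVM to lower into wide SIMD loads and stores when the block type carries `#[repr(simd)]`.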