From 2c4f59a99f522d001c27e8f3138a4b7d62a9f514 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Fri, 8 Jun 2018 11:52:37 -0400
Subject: [PATCH 1/4] Document `@simd` and remove invalid uses

Fixes #27482.

Copies the information from the performance tips into a docstring; makes it a bit scarier.
---
 base/broadcast.jl                  |  2 +-
 base/multidimensional.jl           |  2 +-
 base/reduce.jl                     |  2 +-
 base/reducedim.jl                  |  4 +-
 base/simdloop.jl                   | 73 ++++++++++++++++++++++++++++++
 doc/src/manual/performance-tips.md | 10 ++--
 6 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 1e31171589c0a..36426840f5610 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -828,7 +828,7 @@ preprocess_args(dest, args::Tuple{}) = ()
         end
     end
     bc′ = preprocess(dest, bc)
-    @simd for I in eachindex(bc′)
+    for I in eachindex(bc′)
         @inbounds dest[I] = bc′[I]
     end
     return dest
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 3d2e375dc9ccf..708709d952679 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1458,7 +1458,7 @@ end
         B[I] = (AI, AI)
     end
     Bmax = CartesianIndex(sB)
-    @inbounds @simd for I in CartesianIndices(sA)
+    @inbounds for I in CartesianIndices(sA)
         J = min(Bmax,I)
         BJ = B[J]
         AI = A[I]
diff --git a/base/reduce.jl b/base/reduce.jl
index 989e34c5d95b0..b1607e39de906 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -190,7 +190,7 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
         @inbounds a1 = A[ifirst]
         @inbounds a2 = A[ifirst+1]
         v = op(f(a1), f(a2))
-        @simd for i = ifirst + 2 : ilast
+        for i = ifirst + 2 : ilast
             @inbounds ai = A[i]
             v = op(v, f(ai))
         end
diff --git a/base/reducedim.jl b/base/reducedim.jl
index c8d5e720b6f89..7801863e922b4 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -225,7 +225,7 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
         @inbounds for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
             r = R[i1,IR]
-            @simd for i in axes(A, 1)
+            for i in axes(A, 1)
                 r = op(r, f(A[i, IA]))
             end
             R[i1,IR] = r
@@ -233,7 +233,7 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
     else
         @inbounds for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            @simd for i in axes(A, 1)
+            for i in axes(A, 1)
                 R[i,IR] = op(R[i,IR], f(A[i,IA]))
             end
         end
diff --git a/base/simdloop.jl b/base/simdloop.jl
index 7493fbbd585a5..f71fd9b265868 100644
--- a/base/simdloop.jl
+++ b/base/simdloop.jl
@@ -85,6 +85,79 @@ function compile(x)
     end
 end
 
+"""
+    @simd
+
+Annotate a `for` loop to allow the compiler to take extra liberties to allow vectorization
+
+!!! warning
+
+    This feature is experimental and could change or disappear in future versions of Julia.
+    Incorrect use of the `@simd` macro may cause unexpected results.
+
+By using `@simd`, you are are asserting several properties of the loop:
+
+* It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
+* Floating-point operations on reduction variables can be reordered, possibly causing different results than without `@simd`.
+* No iteration ever waits on a previous iteration to make forward progress.
+
+In many cases, Julia is able to automatically vectorize inner for loops without the use of `@simd`.
+Using `@simd` gives the compiler a little extra leeway to make it possible in more situations. In
+either case, your inner loop should have the following properties to allow vectorization:
+
+* The loop must be an innermost loop
+* The loop body must be straight-line code. Therefore, [`@inbounds`](@ref) is
+  currently needed for all array accesses. The compiler can sometimes turn
+  short `&&`, `||`, and `?:` expressions into straight-line code if it is safe
+  to evaluate all operands unconditionally. Consider using the [`ifelse`](@ref)
+  function instead of `?:` in the loop if it is safe to do so.
+* Accesses must have a stride pattern and cannot be "gathers" (random-index
+  reads) or "scatters" (random-index writes).
+* The stride should be unit stride.
+
+### Example
+
+```
+function inner(x::Array, y::Array)
+    s = zero(eltype(x))
+    for i=eachindex(x)
+        @inbounds s += x[i]*y[i]
+    end
+    s
+end
+
+function innersimd(x::Array, y::Array)
+    s = zero(eltype(x))
+    @simd for i=eachindex(x)
+        @inbounds s += x[i]*y[i]
+    end
+    s
+end
+
+function timeit(n, reps)
+    x = rand(Float32,n)
+    y = rand(Float32,n)
+    s = zero(Float64)
+    time = @elapsed for j in 1:reps
+        s+=inner(x,y)
+    end
+    println("GFlop/sec        = ",2.0*n*reps/time*1E-9)
+    time = @elapsed for j in 1:reps
+        s+=innersimd(x,y)
+    end
+    println("GFlop/sec (SIMD) = ",2.0*n*reps/time*1E-9)
+end
+
+timeit(1000,1000)
+```
+
+On a computer with a 2.4GHz Intel Core i5 processor, this produces:
+
+```
+GFlop/sec        = 1.9467069505224963
+GFlop/sec (SIMD) = 17.578554163920018
+```
+"""
 macro simd(forloop)
     esc(compile(forloop))
 end
diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md
index a0c2d0b620efb..ebc4f62d9631d 100644
--- a/doc/src/manual/performance-tips.md
+++ b/doc/src/manual/performance-tips.md
@@ -1138,7 +1138,7 @@ Sometimes you can enable better optimization by promising certain program proper
   * Use `@fastmath` to allow floating point optimizations that are correct for real numbers, but lead
     to differences for IEEE numbers. Be careful when doing this, as this may change numerical results.
     This corresponds to the `-ffast-math` option of clang.
-  * Write `@simd` in front of `for` loops that are amenable to vectorization. **This feature is experimental**
+  * Write `@simd` in front of `for` loops to assert that they are amenable to vectorization. **This feature is experimental**
     and could change or disappear in future versions of Julia.
 
 The common idiom of using 1:n to index into an AbstractArray is not safe if the Array uses unconventional indexing,
@@ -1205,11 +1205,11 @@ loop:
     results than without `@simd`.
   * No iteration ever waits on another iteration to make forward progress.
 
-A loop containing `break`, `continue`, or `@goto` will cause a compile-time error.
+Violating these assumptions may cause errors or incorrect results. A loop containing `break`, `continue`, or `@goto` will cause a compile-time error.
 
-Using `@simd` merely gives the compiler license to vectorize. Whether it actually does so depends
-on the compiler. To actually benefit from the current implementation, your loop should have the
-following additional properties:
+In many cases, Julia is able to automatically vectorize inner for loops without the use of `@simd`.
+Using `@simd` gives the compiler a little extra leeway to make vectorization possible in more situations.
+In either case, your loop should have the following properties to allow vectorization:
 
   * The loop must be an innermost loop.
   * The loop body must be straight-line code. This is why `@inbounds` is currently needed for all

From dd88c6a3a130a8e2293f85e940e3689fcb962157 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Fri, 8 Jun 2018 18:37:23 -0400
Subject: [PATCH 2/4] Deduplicate & link docs; add test

---
 base/simdloop.jl                   | 75 +++++++-----------------------
 doc/src/base/base.md               |  1 +
 doc/src/manual/performance-tips.md | 46 ++++++------------
 test/bitarray.jl                   | 10 ++++
 4 files changed, 40 insertions(+), 92 deletions(-)

diff --git a/base/simdloop.jl b/base/simdloop.jl
index f71fd9b265868..42ec45815100f 100644
--- a/base/simdloop.jl
+++ b/base/simdloop.jl
@@ -88,75 +88,32 @@ end
 """
     @simd
 
-Annotate a `for` loop to allow the compiler to take extra liberties to allow vectorization
-
-!!! warning
+Annotate a `for` loop to allow the compiler to take extra liberties to allow loop re-ordering.
 
+!!! warning
     This feature is experimental and could change or disappear in future versions of Julia.
     Incorrect use of the `@simd` macro may cause unexpected results.
 
-By using `@simd`, you are are asserting several properties of the loop:
+The object iterated over in a `@simd for` loop should be a one-dimensional range.
+By using `@simd`, you are asserting several properties of the loop:
 
-* It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
-* Floating-point operations on reduction variables can be reordered, possibly causing different results than without `@simd`.
-* No iteration ever waits on a previous iteration to make forward progress.
+    * It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
+    * Floating-point operations on reduction variables can be reordered, possibly causing different results than without `@simd`.
+    * No iteration ever waits on a previous iteration to make forward progress.
 
 In many cases, Julia is able to automatically vectorize inner for loops without the use of `@simd`.
 Using `@simd` gives the compiler a little extra leeway to make it possible in more situations. In
 either case, your inner loop should have the following properties to allow vectorization:
 
-* The loop must be an innermost loop
-* The loop body must be straight-line code. Therefore, [`@inbounds`](@ref) is
-  currently needed for all array accesses. The compiler can sometimes turn
-  short `&&`, `||`, and `?:` expressions into straight-line code if it is safe
-  to evaluate all operands unconditionally. Consider using the [`ifelse`](@ref)
-  function instead of `?:` in the loop if it is safe to do so.
-* Accesses must have a stride pattern and cannot be "gathers" (random-index
-  reads) or "scatters" (random-index writes).
-* The stride should be unit stride.
-
-### Example
-
-```
-function inner(x::Array, y::Array)
-    s = zero(eltype(x))
-    for i=eachindex(x)
-        @inbounds s += x[i]*y[i]
-    end
-    s
-end
-
-function innersimd(x::Array, y::Array)
-    s = zero(eltype(x))
-    @simd for i=eachindex(x)
-        @inbounds s += x[i]*y[i]
-    end
-    s
-end
-
-function timeit(n, reps)
-    x = rand(Float32,n)
-    y = rand(Float32,n)
-    s = zero(Float64)
-    time = @elapsed for j in 1:reps
-        s+=inner(x,y)
-    end
-    println("GFlop/sec        = ",2.0*n*reps/time*1E-9)
-    time = @elapsed for j in 1:reps
-        s+=innersimd(x,y)
-    end
-    println("GFlop/sec (SIMD) = ",2.0*n*reps/time*1E-9)
-end
-
-timeit(1000,1000)
-```
-
-On a computer with a 2.4GHz Intel Core i5 processor, this produces:
-
-```
-GFlop/sec        = 1.9467069505224963
-GFlop/sec (SIMD) = 17.578554163920018
-```
+* The loop must be an innermost loop.
+* The loop body must be straight-line code. Therefore, [`@inbounds`](@ref) is
+  currently needed for all array accesses. The compiler can sometimes turn
+  short `&&`, `||`, and `?:` expressions into straight-line code if it is safe
+  to evaluate all operands unconditionally. Consider using the [`ifelse`](@ref)
+  function instead of `?:` in the loop if it is safe to do so.
+* Accesses must have a stride pattern and cannot be "gathers" (random-index
+  reads) or "scatters" (random-index writes).
+* The stride should be unit stride.
 """
 macro simd(forloop)
     esc(compile(forloop))
diff --git a/doc/src/base/base.md b/doc/src/base/base.md
index 030ffe3a837e4..f5a28be131d71 100644
--- a/doc/src/base/base.md
+++ b/doc/src/base/base.md
@@ -219,6 +219,7 @@ Base.gensym
 Base.@gensym
 Base.@goto
 Base.@label
+Base.@simd
 Base.@polly
 ```
 
diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md
index ebc4f62d9631d..aba5ba560c0f8 100644
--- a/doc/src/manual/performance-tips.md
+++ b/doc/src/manual/performance-tips.md
@@ -1133,12 +1133,18 @@ These are some minor points that might help in tight inner loops.
 
 Sometimes you can enable better optimization by promising certain program properties.
 
-  * Use `@inbounds` to eliminate array bounds checking within expressions. Be certain before doing
+  * Use [`@inbounds`](@ref) to eliminate array bounds checking within expressions. Be certain before doing
     this. If the subscripts are ever out of bounds, you may suffer crashes or silent corruption.
-  * Use `@fastmath` to allow floating point optimizations that are correct for real numbers, but lead
+  * Use [`@fastmath`](@ref) to allow floating point optimizations that are correct for real numbers, but lead
     to differences for IEEE numbers. Be careful when doing this, as this may change numerical results.
     This corresponds to the `-ffast-math` option of clang.
-  * Write `@simd` in front of `for` loops to assert that they are amenable to vectorization. **This feature is experimental**
+  * Write [`@simd`](@ref) in front of `for` loops to promise that the iterations are independent and may be
+    reordered.  Note that in many cases, Julia can automatically vectorize code without the `@simd` macro;
+    it is only beneficial in cases where such a transformation would otherwise be illegal, including cases
+    like allowing floating-point re-associativity and ignoring dependent memory accesses. Again, be very
+    careful when asserting `@simd` as erroneously annotating a loop with dependent iterations may result
+    in unexpected results. In particular, note that `setindex!` on some `AbstractArray` subtypes is
+    enherently dependent upon iteration order. **This feature is experimental**
     and could change or disappear in future versions of Julia.
 
 The common idiom of using 1:n to index into an AbstractArray is not safe if the Array uses unconventional indexing,
@@ -1146,9 +1152,9 @@ and may cause a segmentation fault if bounds checking is turned off. Use `Linear
 instead (see also [offset-arrays](https://docs.julialang.org/en/latest/devdocs/offset-arrays)).
 
 !!!note
-    While `@simd` needs to be placed directly in front of a loop, both `@inbounds` and `@fastmath`
-    can be applied to several statements at once, e.g. using `begin` ... `end`, or even to a whole
-    function.
+    While `@simd` needs to be placed directly in front of an innermost `for` loop, both `@inbounds` and `@fastmath`
+    can be applied to either single expressions or all the expressions that appear within nested blocks of code, e.g.,
+    using `@inbounds begin` or `@inbounds for ...`.
 
 Here is an example with both `@inbounds` and `@simd` markup (we here use `@noinline` to prevent
 the optimizer from trying to be too clever and defeat our benchmark):
@@ -1194,33 +1200,7 @@ GFlop/sec        = 1.9467069505224963
 GFlop/sec (SIMD) = 17.578554163920018
 ```
 
-(`GFlop/sec` measures the performance, and larger numbers are better.) The range for a `@simd for`
-loop should be a one-dimensional range. A variable used for accumulating, such as `s` in the example,
-is called a *reduction variable*. By using `@simd`, you are asserting several properties of the
-loop:
-
-  * It is safe to execute iterations in arbitrary or overlapping order, with special consideration
-    for reduction variables.
-  * Floating-point operations on reduction variables can be reordered, possibly causing different
-    results than without `@simd`.
-  * No iteration ever waits on another iteration to make forward progress.
-
-Violating these assumptions may cause errors or incorrect results. A loop containing `break`, `continue`, or `@goto` will cause a compile-time error.
-
-In many cases, Julia is able to automatically vectorize inner for loops without the use of `@simd`.
-Using `@simd` gives the compiler a little extra leeway to make vectorization possible in more situations.
-In either case, your loop should have the following properties to allow vectorization:
-
-  * The loop must be an innermost loop.
-  * The loop body must be straight-line code. This is why `@inbounds` is currently needed for all
-    array accesses. The compiler can sometimes turn short `&&`, `||`, and `?:` expressions into straight-line
-    code, if it is safe to evaluate all operands unconditionally. Consider using the [`ifelse`](@ref)
-    function instead of `?:` in the loop if it is safe to do so.
-  * Accesses must have a stride pattern and cannot be "gathers" (random-index reads) or "scatters"
-    (random-index writes).
-  * The stride should be unit stride.
-  * In some simple cases, for example with 2-3 arrays accessed in a loop, the LLVM auto-vectorization
-    may kick in automatically, leading to no further speedup with `@simd`.
+(`GFlop/sec` measures the performance, and larger numbers are better.)
 
 Here is an example with all three kinds of markup. This program first calculates the finite difference
 of a one-dimensional array, and then evaluates the L2-norm of the result:
diff --git a/test/bitarray.jl b/test/bitarray.jl
index 997ae0b1edc6a..ef5e1910f64aa 100644
--- a/test/bitarray.jl
+++ b/test/bitarray.jl
@@ -1575,3 +1575,13 @@ end
         end
     end
 end
+
+@testset "SIMD violations (issue #27482)" begin
+    @test all(any!(falses(10), trues(10, 10)))
+    @check_bit_operation any!(falses(10), trues(10, 10))
+    @check_bit_operation any!(falses(100), trues(100, 100))
+    @check_bit_operation any!(falses(1000), trues(1000, 100))
+    @check_bit_operation all!(falses(10), trues(10, 10))
+    @check_bit_operation all!(falses(100), trues(100, 100))
+    @check_bit_operation all!(falses(1000), trues(1000, 100))
+end

From bff1c4895631f47108f2bf3881261c8df08596de Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Tue, 12 Jun 2018 12:12:09 -0400
Subject: [PATCH 3/4] Try working around the performance hit in common cases

---
 base/broadcast.jl                  | 17 ++++++++--
 base/multidimensional.jl           | 21 ++++++++++++
 base/reduce.jl                     | 35 +++++++++++++++-----
 base/reducedim.jl                  | 51 ++++++++++++++++++++++++------
 base/simdloop.jl                   |  2 +-
 doc/src/manual/performance-tips.md |  2 +-
 6 files changed, 106 insertions(+), 22 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 36426840f5610..0d2e37f3451a0 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -828,12 +828,25 @@ preprocess_args(dest, args::Tuple{}) = ()
         end
     end
     bc′ = preprocess(dest, bc)
-    for I in eachindex(bc′)
-        @inbounds dest[I] = bc′[I]
+    if _is_simd_safe(dest, bc)
+        @inbounds @simd for I in eachindex(bc′)
+            dest[I] = bc′[I]
+        end
+    else
+        @inbounds for I in eachindex(bc′)
+            dest[I] = bc′[I]
+        end
     end
     return dest
 end
 
+_is_simd_safe(::Any, ::Any) = false
+@inline _is_simd_safe(::Array, bc::Broadcasted) = _args_are_simd_safe(bc)
+_args_are_simd_safe() = true
+_args_are_simd_safe(::Any, args...) = false
+@inline _args_are_simd_safe(::Union{Array, Number}, args...) = _args_are_simd_safe(args...)
+@inline _args_are_simd_safe(bc::Broadcasted, args...) = Base.simdable(bc.f) isa Base.SIMDableFunction && _args_are_simd_safe(args...)
+
 # Performance optimization: for BitArray outputs, we cache the result
 # in a "small" Vector{Bool}, and then copy in chunks into the output
 @inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 708709d952679..9d297a1509ca9 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1470,6 +1470,27 @@ end
     end
     return B
 end
+# When both arrays are ::Array the SIMD transform is safe
+@noinline function extrema!(B::Array, A::Array)
+    sA = size(A)
+    sB = size(B)
+    for I in CartesianIndices(sB)
+        AI = A[I]
+        B[I] = (AI, AI)
+    end
+    Bmax = CartesianIndex(sB)
+    @inbounds @simd for I in CartesianIndices(sA)
+        J = min(Bmax,I)
+        BJ = B[J]
+        AI = A[I]
+        if AI < BJ[1]
+            B[J] = (AI, BJ[2])
+        elseif AI > BJ[2]
+            B[J] = (BJ[1], AI)
+        end
+    end
+    return B
+end
 
 # Show for pairs() with Cartesian indicies. Needs to be here rather than show.jl for bootstrap order
 function Base.showarg(io::IO, r::Iterators.Pairs{<:Integer, <:Any, <:Any, T}, toplevel) where T <: Union{AbstractVector, Tuple}
diff --git a/base/reduce.jl b/base/reduce.jl
index b1607e39de906..a0cf96614fc96 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -187,14 +187,7 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
         return mapreduce_first(f, op, a1)
     elseif ifirst + blksize > ilast
         # sequential portion
-        @inbounds a1 = A[ifirst]
-        @inbounds a2 = A[ifirst+1]
-        v = op(f(a1), f(a2))
-        for i = ifirst + 2 : ilast
-            @inbounds ai = A[i]
-            v = op(v, f(ai))
-        end
-        return v
+        return _mapreduce_impl_loop(simdable(f), simdable(op), A, ifirst, ilast)
     else
         # pairwise portion
         imid = (ifirst + ilast) >> 1
@@ -204,6 +197,32 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
     end
 end
 
+# The @simd transformation is only valid in limited situations
+struct SIMDableFunction{f}; end
+(::SIMDableFunction{f})(args...) where {f} = f(args...)
+simdable(f::Union{map(typeof, (+, *, &, |, add_sum, mul_prod, -, /, ^, identity))...}) = SIMDableFunction{f}()
+simdable(f) = f
+function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Array, ifirst, ilast)
+    @inbounds a1 = A[ifirst]
+    @inbounds a2 = A[ifirst+1]
+    v = op(f(a1), f(a2))
+    @simd for i = ifirst + 2 : ilast
+        @inbounds ai = A[i]
+        v = op(v, f(ai))
+    end
+    return v
+end
+function _mapreduce_impl_loop(f, op, A, ifirst, ilast)
+    @inbounds a1 = A[ifirst]
+    @inbounds a2 = A[ifirst+1]
+    v = op(f(a1), f(a2))
+    for i = ifirst + 2 : ilast
+        @inbounds ai = A[i]
+        v = op(v, f(ai))
+    end
+    return v
+end
+
 mapreduce_impl(f, op, A::AbstractArray, ifirst::Integer, ilast::Integer) =
     mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))
 
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 7801863e922b4..9051c26f14c11 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -222,24 +222,55 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
     if reducedim1(R, A)
         # keep the accumulator as a local variable when reducing along the first dimension
         i1 = first(indices1(R))
-        @inbounds for IA in CartesianIndices(indsAt)
+        for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            r = R[i1,IR]
-            for i in axes(A, 1)
-                r = op(r, f(A[i, IA]))
-            end
-            R[i1,IR] = r
+            _mapreducedim_loop1!(simdable(f), simdable(op), R, A, IR, IA, i1)
         end
     else
-        @inbounds for IA in CartesianIndices(indsAt)
+        for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            for i in axes(A, 1)
-                R[i,IR] = op(R[i,IR], f(A[i,IA]))
-            end
+            _mapreducedim_loop!(simdable(f), simdable(op), R, A, IR, IA)
+        end
+    end
+    return R
+end
+
+# The innermost loops are split out to allow for @simd in known safe cases
+# add a few more simd-safe functions that were not available earlier in bootstrap
+simdable(f::Union{map(typeof, (abs, sqrt, log, log10, log2))...}) = SIMDableFunction{f}()
+@inline function _mapreducedim_loop1!(f, op, R, A, IR, IA, i1)
+    @inbounds begin
+        r = R[i1,IR]
+        for i in axes(A, 1)
+            r = op(r, f(A[i, IA]))
         end
+        R[i1,IR] = r
     end
     return R
 end
+@inline function _mapreducedim_loop1!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA, i1)
+    @inbounds begin
+        r = R[i1,IR]
+        @simd for i in axes(A, 1)
+            r = op(r, f(A[i, IA]))
+        end
+        R[i1,IR] = r
+    end
+    return R
+end
+@inline function _mapreducedim_loop!(f, op, R, A, IR, IA)
+    @inbounds for i in axes(A, 1)
+        R[i,IR] = op(R[i,IR], f(A[i,IA]))
+    end
+    return R
+end
+@inline function _mapreducedim_loop!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA)
+    @inbounds @simd for i in axes(A, 1)
+        R[i,IR] = op(R[i,IR], f(A[i,IA]))
+    end
+    return R
+end
+
 
 mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) =
     (_mapreducedim!(f, op, R, A); R)
diff --git a/base/simdloop.jl b/base/simdloop.jl
index 42ec45815100f..8c9d4c6fa3741 100644
--- a/base/simdloop.jl
+++ b/base/simdloop.jl
@@ -94,7 +94,7 @@ Annotate a `for` loop to allow the compiler to take extra liberties to allow loo
     This feature is experimental and could change or disappear in future versions of Julia.
     Incorrect use of the `@simd` macro may cause unexpected results.
 
-The object iterated over in a `@simd for` loop should be a one-dimensional range.
+The object iterated over in a `@simd for` loop should be a one-dimensional range or a `CartesianIndices` iterator.
 By using `@simd`, you are asserting several properties of the loop:
 
     * It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md
index aba5ba560c0f8..006aa91d808a7 100644
--- a/doc/src/manual/performance-tips.md
+++ b/doc/src/manual/performance-tips.md
@@ -1144,7 +1144,7 @@ Sometimes you can enable better optimization by promising certain program proper
     like allowing floating-point re-associativity and ignoring dependent memory accesses. Again, be very
     careful when asserting `@simd` as erroneously annotating a loop with dependent iterations may result
     in unexpected results. In particular, note that `setindex!` on some `AbstractArray` subtypes is
-    enherently dependent upon iteration order. **This feature is experimental**
+    inherently dependent upon iteration order. **This feature is experimental**
     and could change or disappear in future versions of Julia.
 
 The common idiom of using 1:n to index into an AbstractArray is not safe if the Array uses unconventional indexing,

From c0e785c1c5fd28be591ed4b35753e4199505fffd Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Wed, 13 Jun 2018 01:37:59 -0400
Subject: [PATCH 4/4] inline inner loop functions and add abs2 as a simdable
 function

---
 base/reduce.jl    | 4 ++--
 base/reducedim.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index a0cf96614fc96..8ccf79b43847a 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -202,7 +202,7 @@ struct SIMDableFunction{f}; end
 (::SIMDableFunction{f})(args...) where {f} = f(args...)
 simdable(f::Union{map(typeof, (+, *, &, |, add_sum, mul_prod, -, /, ^, identity))...}) = SIMDableFunction{f}()
 simdable(f) = f
-function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Array, ifirst, ilast)
+@inline function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Array, ifirst, ilast)
     @inbounds a1 = A[ifirst]
     @inbounds a2 = A[ifirst+1]
     v = op(f(a1), f(a2))
@@ -212,7 +212,7 @@ function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Arra
     end
     return v
 end
-function _mapreduce_impl_loop(f, op, A, ifirst, ilast)
+@inline function _mapreduce_impl_loop(f, op, A, ifirst, ilast)
     @inbounds a1 = A[ifirst]
     @inbounds a2 = A[ifirst+1]
     v = op(f(a1), f(a2))
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 9051c26f14c11..9eeaa5fcbbec4 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -237,7 +237,7 @@ end
 
 # The innermost loops are split out to allow for @simd in known safe cases
 # add a few more simd-safe functions that were not available earlier in bootstrap
-simdable(f::Union{map(typeof, (abs, sqrt, log, log10, log2))...}) = SIMDableFunction{f}()
+simdable(f::Union{map(typeof, (abs, abs2, sqrt, log, log10, log2))...}) = SIMDableFunction{f}()
 @inline function _mapreducedim_loop1!(f, op, R, A, IR, IA, i1)
     @inbounds begin
         r = R[i1,IR]