Document @simd and remove invalid uses #27495

Closed · wants to merge 4 commits
17 changes: 15 additions & 2 deletions base/broadcast.jl
@@ -828,12 +828,25 @@ preprocess_args(dest, args::Tuple{}) = ()
end
end
bc′ = preprocess(dest, bc)
@simd for I in eachindex(bc′)
@inbounds dest[I] = bc′[I]
if _is_simd_safe(dest, bc)
@inbounds @simd for I in eachindex(bc′)
dest[I] = bc′[I]
end
else
@inbounds for I in eachindex(bc′)
dest[I] = bc′[I]
end
end
return dest
end

_is_simd_safe(::Any, ::Any) = false
@inline _is_simd_safe(::Array, bc::Broadcasted) = _args_are_simd_safe(bc)
_args_are_simd_safe() = true
_args_are_simd_safe(::Any, args...) = false
@inline _args_are_simd_safe(::Union{Array, Number}, args...) = _args_are_simd_safe(args...)
@inline _args_are_simd_safe(bc::Broadcasted, args...) = Base.simdable(bc.f) isa Base.SIMDableFunction && _args_are_simd_safe(args...)
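For context on why a guard like this is needed at all: a destination such as `BitArray` (which gets its own `copyto!` specialization just below) packs its bits into 64-bit chunks, so writes to distinct indices still touch shared words and are not independent across iterations. A rough sketch of that read-modify-write pattern (illustrative only, not the actual `BitArray` implementation):

```julia
# Illustrative sketch (not the real BitArray code): storing bit `i` rewrites the
# whole 64-bit chunk containing it, so two iterations that set different bits in
# the same chunk both read and write the same word.
function set_bit!(chunks::Vector{UInt64}, i::Integer, b::Bool)
    word, bit = divrem(i - 1, 64)
    mask = UInt64(1) << bit
    c = chunks[word + 1]                              # read the shared word
    chunks[word + 1] = b ? (c | mask) : (c & ~mask)   # write it back modified
    return chunks
end
```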

# Performance optimization: for BitArray outputs, we cache the result
# in a "small" Vector{Bool}, and then copy in chunks into the output
@inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})
21 changes: 21 additions & 0 deletions base/multidimensional.jl
@@ -1451,6 +1451,27 @@ function extrema(A::AbstractArray, dims)
end

@noinline function extrema!(B, A)
sA = size(A)
sB = size(B)
for I in CartesianIndices(sB)
AI = A[I]
B[I] = (AI, AI)
end
Bmax = CartesianIndex(sB)
@inbounds for I in CartesianIndices(sA)
J = min(Bmax,I)
BJ = B[J]
AI = A[I]
if AI < BJ[1]
B[J] = (AI, BJ[2])
elseif AI > BJ[2]
B[J] = (BJ[1], AI)
end
end
return B
end
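A small usage sketch (not part of the diff) of what the generic method above computes: `B`'s shape selects the reduced dimensions, and each cell of `B` receives a `(min, max)` tuple.

```julia
# Illustrative usage: reduce a 2×2 matrix along its first dimension into
# per-column (min, max) tuples.
A = [1 5;
     3 2]
B = similar(A, Tuple{Int,Int}, (1, 2))
Base.extrema!(B, A)      # B == [(1, 3) (2, 5)], i.e. column minima and maxima
```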
# When both arrays are ::Array the SIMD transform is safe
@noinline function extrema!(B::Array, A::Array)
sA = size(A)
sB = size(B)
for I in CartesianIndices(sB)
35 changes: 27 additions & 8 deletions base/reduce.jl
@@ -187,14 +187,7 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
return mapreduce_first(f, op, a1)
elseif ifirst + blksize > ilast
# sequential portion
@inbounds a1 = A[ifirst]
@inbounds a2 = A[ifirst+1]
v = op(f(a1), f(a2))
@simd for i = ifirst + 2 : ilast
@inbounds ai = A[i]
v = op(v, f(ai))
end
return v
return _mapreduce_impl_loop(simdable(f), simdable(op), A, ifirst, ilast)
else
# pairwise portion
imid = (ifirst + ilast) >> 1
@@ -204,6 +197,32 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
end
end

# The @simd transformation is only valid in limited situations
struct SIMDableFunction{f}; end
(::SIMDableFunction{f})(args...) where {f} = f(args...)
simdable(f::Union{map(typeof, (+, *, &, |, add_sum, mul_prod, -, /, ^, identity))...}) = SIMDableFunction{f}()
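A quick illustration (not part of the diff) of how this wrapper behaves, together with the `simdable(f) = f` fallback defined a little further down:

```julia
w = Base.simdable(+)                  # `+` is on the whitelist, so it gets wrapped
@assert w isa Base.SIMDableFunction   # the wrapper carries `+` as a type parameter
@assert w(1, 2) == 3                  # calling the wrapper just forwards to `+`

g(x) = x + 1
@assert Base.simdable(g) === g        # unknown functions pass through unchanged
```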
Member commented:

I am not sure that this is meaningful. The question is: when is a function not valid to be executed as SIMD? When it has side effects? `@simd` is a hint to LLVM that gives it permission to perform some limited transformations and encourages it to be more lenient, but LLVM still has to prove that the SIMD transformation is valid and beneficial.

Member Author commented:

Yes, this is a poor name. I don't really want to be building a trait system here — I wanted to just use a hard-coded `Union{}` — but due to bootstrapping issues not all the functions I wanted to whitelist were available. Of course, we cannot guarantee that these functions won't have an invalid method defined on them.

Really, the problem is that `@simd` is non-local. It affects any function that gets inlined — even passed user functions. We have no idea if the particular method is going to have observable side effects that break the SIMD guarantees. This is an attempt at a whitelist to recoup some of the performance in common cases — but perhaps we shouldn't even try to do that.

> LLVM still has to prove that the SIMD transformation is valid and beneficial

My understanding is that `@simd` is promising that the transformation is valid — allowing LLVM to do things that it cannot prove.

Member commented:

> allowing LLVM to do things that it cannot prove.

Essentially true, but the list above is largely redundant with things that LLVM should be able to prove (since it's approximately just a subset of the list of known pure functions) and, beyond that, is incorrect in general (there's no guarantee – or expectation – that my implementation of any of those functions is actually pure).

Member commented:

> My understanding is that `@simd` is promising that the transformation is valid

It defines that certain transforms are valid, but we implement that list. Currently it contains:

* primitive ops that contribute to the computation of a floating-point reduction may be reordered (reassociated / commuted)
* all memory writes are independent of all other memory writes and reads performed inside the loop (e.g. all memory that is read from is read-only, and no location is written to twice)
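As a minimal illustration of the first point, reassociating a floating-point reduction can change the low-order bits of the result:

```julia
# Illustrative sketch: the same reduction with and without @simd. The @simd
# version may reassociate the additions (e.g. into several partial sums),
# so the two results can differ by rounding.
function sum_sequential(x)
    s = zero(eltype(x))
    for v in x
        s += v
    end
    return s
end

function sum_simd(x)
    s = zero(eltype(x))
    @simd for i in eachindex(x)
        @inbounds s += x[i]
    end
    return s
end

x = rand(10^6)
sum_sequential(x) == sum_simd(x)   # not guaranteed to be true
```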

Member Author commented:

Ah, great, that is extremely helpful. So if we disable the memory-independence assertion (b68efd3), is it correct that one could safely annotate `@simd` over unknown generic functions? My read is yes: this simply adds extra handling for reduction variables — which, by definition, must be lexically visible to the `@simd` author.

Memory dependencies, on the other hand, might not be lexically visible to the `@simd` author. Doing something like #19658 could perhaps allow us to recoup some of the lost performance in the future in a much safer and more precise way.

Member commented:

That's my understanding too, although there's some non-locality in the definition of what constitutes "computation of the reduction", since we just examine the use/def graph after optimization. It is possible to make a closure which obscures the reduction variable lexically. For example:

# the reduction variable is visible directly in the loop body
result = Ref(0.0)
for i = 1:10
    result[] += i
end

# the same reduction, but the loop body no longer mentions `result` lexically
result = Ref(0.0)
update(i) = (result[] += i)
for i = 1:10
    update(i)
end

This pattern appears, for example, in the `Stateful` iterator.

simdable(f) = f
@inline function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Array, ifirst, ilast)
@inbounds a1 = A[ifirst]
@inbounds a2 = A[ifirst+1]
v = op(f(a1), f(a2))
@simd for i = ifirst + 2 : ilast
@inbounds ai = A[i]
v = op(v, f(ai))
end
return v
end
@inline function _mapreduce_impl_loop(f, op, A, ifirst, ilast)
@inbounds a1 = A[ifirst]
@inbounds a2 = A[ifirst+1]
v = op(f(a1), f(a2))
for i = ifirst + 2 : ilast
@inbounds ai = A[i]
v = op(v, f(ai))
end
return v
end
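For orientation, a sketch (not part of the diff) of calls that would reach each of these two methods:

```julia
A = rand(1000)
sum(A)                      # identity and add_sum are whitelisted, so the @simd method runs
mapreduce(x -> x^3, +, A)   # the anonymous function is not whitelisted, so the plain loop runs
```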

mapreduce_impl(f, op, A::AbstractArray, ifirst::Integer, ilast::Integer) =
mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))

51 changes: 41 additions & 10 deletions base/reducedim.jl
@@ -222,24 +222,55 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
if reducedim1(R, A)
# keep the accumulator as a local variable when reducing along the first dimension
i1 = first(indices1(R))
@inbounds for IA in CartesianIndices(indsAt)
for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
r = R[i1,IR]
@simd for i in axes(A, 1)
r = op(r, f(A[i, IA]))
end
R[i1,IR] = r
_mapreducedim_loop1!(simdable(f), simdable(op), R, A, IR, IA, i1)
end
else
@inbounds for IA in CartesianIndices(indsAt)
for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
@simd for i in axes(A, 1)
R[i,IR] = op(R[i,IR], f(A[i,IA]))
end
_mapreducedim_loop!(simdable(f), simdable(op), R, A, IR, IA)
end
end
return R
end

# The innermost loops are split out to allow for @simd in known safe cases
# add a few more simd-safe functions that were not available earlier in bootstrap
simdable(f::Union{map(typeof, (abs, abs2, sqrt, log, log10, log2))...}) = SIMDableFunction{f}()
@inline function _mapreducedim_loop1!(f, op, R, A, IR, IA, i1)
@inbounds begin
r = R[i1,IR]
for i in axes(A, 1)
r = op(r, f(A[i, IA]))
end
R[i1,IR] = r
end
return R
end
@inline function _mapreducedim_loop1!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA, i1)
@inbounds begin
r = R[i1,IR]
@simd for i in axes(A, 1)
r = op(r, f(A[i, IA]))
end
R[i1,IR] = r
end
return R
end
@inline function _mapreducedim_loop!(f, op, R, A, IR, IA)
@inbounds for i in axes(A, 1)
R[i,IR] = op(R[i,IR], f(A[i,IA]))
end
return R
end
@inline function _mapreducedim_loop!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA)
@inbounds @simd for i in axes(A, 1)
R[i,IR] = op(R[i,IR], f(A[i,IA]))
end
return R
end
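For orientation, a sketch (not part of the diff) of the kind of call that ends up in these helpers:

```julia
A = rand(100, 100)
sum(A, dims=1)   # f = identity and op = add_sum are whitelisted and A is an Array,
                 # so the inner loop uses the @simd methods defined above
```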


mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) =
(_mapreducedim!(f, op, R, A); R)
30 changes: 30 additions & 0 deletions base/simdloop.jl
@@ -85,6 +85,36 @@ function compile(x)
end
end

"""
@simd

Annotate a `for` loop to give the compiler extra liberties to re-order the loop iterations.

!!! warning
    This feature is experimental and could change or disappear in future versions of Julia.
    Incorrect use of the `@simd` macro may cause unexpected results.

The object iterated over in a `@simd for` loop should be a one-dimensional range or a CartesianIndices iterator.
By using `@simd`, you are asserting several properties of the loop:

* It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
* Floating-point operations on reduction variables can be reordered, possibly causing different results than without `@simd`.
* No iteration ever waits on a previous iteration to make forward progress.

In many cases, Julia is able to automatically vectorize inner for loops without the use of `@simd`.
Using `@simd` gives the compiler a little extra leeway to make it possible in more situations. In
either case, your inner loop should have the following properties to allow vectorization:

* The loop must be an innermost loop
* The loop body must be straight-line code. Therefore, [`@inbounds`](@ref) is
currently needed for all array accesses. The compiler can sometimes turn
short `&&`, `||`, and `?:` expressions into straight-line code if it is safe
to evaluate all operands unconditionally. Consider using the [`ifelse`](@ref)
function instead of `?:` in the loop if it is safe to do so.
Member commented:

In general, this would be bad advice, since it's harder to generate fast code for `ifelse` than for `?:` (it's generally easy for the compiler to optimize `?:` into `ifelse` when that would be profitable, but the reverse is not true). The better advice (often) is to avoid calling functions in the branches and instead just select between two values (essentially equivalent to defining this as `ifelse(x, y, z) = (x ? y : z)`, although we wouldn't currently infer this definition any better than the existing `ifelse`, because we don't do IPO conditional-value propagation).

* Accesses must have a stride pattern and cannot be "gathers" (random-index
reads) or "scatters" (random-index writes).
* The stride should be unit stride.
"""
macro simd(forloop)
esc(compile(forloop))
end
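A usage sketch (not part of the diff) that satisfies the requirements listed in the docstring: an innermost loop with a straight-line body and unit-stride, `@inbounds`-guarded accesses:

```julia
# Hypothetical helper for illustration: y[i] += a*x[i] over unit-stride vectors.
function scaled_add!(y::Vector{Float64}, a::Float64, x::Vector{Float64})
    @simd for i in eachindex(x, y)
        @inbounds y[i] += a * x[i]
    end
    return y
end

scaled_add!(zeros(8), 2.0, collect(1.0:8.0))   # == [2.0, 4.0, 6.0, ..., 16.0]
```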
1 change: 1 addition & 0 deletions doc/src/base/base.md
@@ -219,6 +219,7 @@ Base.gensym
Base.@gensym
Base.@goto
Base.@label
Base.@simd
Base.@polly
```

46 changes: 13 additions & 33 deletions doc/src/manual/performance-tips.md
@@ -1133,22 +1133,28 @@ These are some minor points that might help in tight inner loops.

Sometimes you can enable better optimization by promising certain program properties.

* Use `@inbounds` to eliminate array bounds checking within expressions. Be certain before doing
* Use [`@inbounds`](@ref) to eliminate array bounds checking within expressions. Be certain before doing
this. If the subscripts are ever out of bounds, you may suffer crashes or silent corruption.
* Use `@fastmath` to allow floating point optimizations that are correct for real numbers, but lead
* Use [`@fastmath`](@ref) to allow floating point optimizations that are correct for real numbers, but lead
to differences for IEEE numbers. Be careful when doing this, as this may change numerical results.
This corresponds to the `-ffast-math` option of clang.
* Write `@simd` in front of `for` loops that are amenable to vectorization. **This feature is experimental**
* Write [`@simd`](@ref) in front of `for` loops to promise that the iterations are independent and may be
reordered. Note that in many cases, Julia can automatically vectorize code without the `@simd` macro;
it is only beneficial in cases where such a transformation would otherwise be illegal, including cases
like allowing floating-point re-associativity and ignoring dependent memory accesses. Again, be very
careful when asserting `@simd` as erroneously annotating a loop with dependent iterations may result
in unexpected results. In particular, note that `setindex!` on some `AbstractArray` subtypes is
inherently dependent upon iteration order. **This feature is experimental**
and could change or disappear in future versions of Julia.

The common idiom of using 1:n to index into an AbstractArray is not safe if the Array uses unconventional indexing,
and may cause a segmentation fault if bounds checking is turned off. Use `LinearIndices(x)` or `eachindex(x)`
instead (see also [offset-arrays](https://docs.julialang.org/en/latest/devdocs/offset-arrays)).
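A brief sketch of that point (assuming the OffsetArrays package for an array with unconventional indices):

```julia
using OffsetArrays               # assumption: OffsetArrays.jl is available

function total(x::AbstractArray)
    s = zero(eltype(x))
    for i in eachindex(x)        # iterates the array's actual indices
        s += x[i]
    end
    return s
end

x = OffsetArray(rand(5), -2:2)   # indices run from -2 to 2, not 1 to 5
total(x)                         # a `1:length(x)` loop would go out of bounds at i = 3
```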

!!! note
While `@simd` needs to be placed directly in front of a loop, both `@inbounds` and `@fastmath`
can be applied to several statements at once, e.g. using `begin` ... `end`, or even to a whole
function.
While `@simd` needs to be placed directly in front of an innermost `for` loop, both `@inbounds` and `@fastmath`
can be applied to either single expressions or all the expressions that appear within nested blocks of code, e.g.,
using `@inbounds begin` or `@inbounds for ...`.

Here is an example with both `@inbounds` and `@simd` markup (we here use `@noinline` to prevent
the optimizer from trying to be too clever and defeat our benchmark):
@@ -1194,33 +1200,7 @@ GFlop/sec = 1.9467069505224963
GFlop/sec (SIMD) = 17.578554163920018
```

(`GFlop/sec` measures the performance, and larger numbers are better.) The range for a `@simd for`
loop should be a one-dimensional range. A variable used for accumulating, such as `s` in the example,
is called a *reduction variable*. By using `@simd`, you are asserting several properties of the
loop:

* It is safe to execute iterations in arbitrary or overlapping order, with special consideration
for reduction variables.
* Floating-point operations on reduction variables can be reordered, possibly causing different
results than without `@simd`.
* No iteration ever waits on another iteration to make forward progress.

A loop containing `break`, `continue`, or `@goto` will cause a compile-time error.

Using `@simd` merely gives the compiler license to vectorize. Whether it actually does so depends
on the compiler. To actually benefit from the current implementation, your loop should have the
following additional properties:

* The loop must be an innermost loop.
* The loop body must be straight-line code. This is why `@inbounds` is currently needed for all
array accesses. The compiler can sometimes turn short `&&`, `||`, and `?:` expressions into straight-line
code, if it is safe to evaluate all operands unconditionally. Consider using the [`ifelse`](@ref)
function instead of `?:` in the loop if it is safe to do so.
* Accesses must have a stride pattern and cannot be "gathers" (random-index reads) or "scatters"
(random-index writes).
* The stride should be unit stride.
* In some simple cases, for example with 2-3 arrays accessed in a loop, the LLVM auto-vectorization
may kick in automatically, leading to no further speedup with `@simd`.
(`GFlop/sec` measures the performance, and larger numbers are better.)

Here is an example with all three kinds of markup. This program first calculates the finite difference
of a one-dimensional array, and then evaluates the L2-norm of the result:
10 changes: 10 additions & 0 deletions test/bitarray.jl
@@ -1575,3 +1575,13 @@ end
end
end
end

@testset "SIMD violations (issue #27482)" begin
@test all(any!(falses(10), trues(10, 10)))
@check_bit_operation any!(falses(10), trues(10, 10))
@check_bit_operation any!(falses(100), trues(100, 100))
@check_bit_operation any!(falses(1000), trues(1000, 100))
@check_bit_operation all!(falses(10), trues(10, 10))
@check_bit_operation all!(falses(100), trues(100, 100))
@check_bit_operation all!(falses(1000), trues(1000, 100))
end