Skip to content

Commit 94d7417

Browse files
vtjnashKristofferC
authored andcommitted
Revert "Make emitted egal code more loopy (#54121)" (#57453)
This reverts a portion of commit 50833c8. This algorithm is not able to handle simple cases where there is any internal padding, such as the example of: ``` struct LotsBytes a::Int8 b::NTuple{256,Int} c::Int end ``` Unfortunately fixing it is a bit of a large project right now, so reverting now to fix correctness while working on that. Fixes #55513 (indirectly, by removing broken code) Maybe reopens #54109, although the latency issue it proposes to fix doesn't occur on master even with this revert (just the mediocre looking IR result output returns) (cherry picked from commit a65c2cf)
1 parent 3c58c10 commit 94d7417

File tree

2 files changed

+0
-192
lines changed

2 files changed

+0
-192
lines changed

Compiler/test/codegen.jl

-51
Original file line numberDiff line numberDiff line change
@@ -889,57 +889,6 @@ ex54166 = Union{Missing, Int64}[missing -2; missing -2];
889889
dims54166 = (1,2)
890890
@test (minimum(ex54166; dims=dims54166)[1] === missing)
891891

892-
# #54109 - Excessive LLVM time for egal
893-
struct DefaultOr54109{T}
894-
x::T
895-
default::Bool
896-
end
897-
898-
@eval struct Torture1_54109
899-
$((Expr(:(::), Symbol("x$i"), DefaultOr54109{Float64}) for i = 1:897)...)
900-
end
901-
Torture1_54109() = Torture1_54109((DefaultOr54109(1.0, false) for i = 1:897)...)
902-
903-
@eval struct Torture2_54109
904-
$((Expr(:(::), Symbol("x$i"), DefaultOr54109{Float64}) for i = 1:400)...)
905-
$((Expr(:(::), Symbol("x$(i+400)"), DefaultOr54109{Int16}) for i = 1:400)...)
906-
end
907-
Torture2_54109() = Torture2_54109((DefaultOr54109(1.0, false) for i = 1:400)..., (DefaultOr54109(Int16(1), false) for i = 1:400)...)
908-
909-
@noinline egal_any54109(x, @nospecialize(y::Any)) = x === Base.compilerbarrier(:type, y)
910-
911-
let ir1 = get_llvm(egal_any54109, Tuple{Torture1_54109, Any}),
912-
ir2 = get_llvm(egal_any54109, Tuple{Torture2_54109, Any})
913-
914-
# We can't really do timing on CI, so instead, let's look at the length of
915-
# the optimized IR. The original version had tens of thousands of lines and
916-
# was slower, so just check here that we only have < 500 lines. If somebody,
917-
# implements a better comparison that's larger than that, just re-benchmark
918-
# this and adjust the threshold.
919-
920-
@test count(==('\n'), ir1) < 500
921-
@test count(==('\n'), ir2) < 500
922-
end
923-
924-
## Regression test for egal of a struct of this size without padding, but with
925-
## non-bitsegal, to make sure that it doesn't accidentally go down the accelerated
926-
## path.
927-
@eval struct BigStructAnyInt
928-
$((Expr(:(::), Symbol("x$i"), Pair{Any, Int}) for i = 1:33)...)
929-
end
930-
BigStructAnyInt() = BigStructAnyInt((Union{Base.inferencebarrier(Float64), Int}=>i for i = 1:33)...)
931-
@test egal_any54109(BigStructAnyInt(), BigStructAnyInt())
932-
933-
## For completeness, also test correctness, since we don't have a lot of
934-
## large-struct tests.
935-
936-
# The two allocations of the same struct will likely have different padding,
937-
# we want to make sure we find them egal anyway - a naive memcmp would
938-
# accidentally look at it.
939-
@test egal_any54109(Torture1_54109(), Torture1_54109())
940-
@test egal_any54109(Torture2_54109(), Torture2_54109())
941-
@test !egal_any54109(Torture1_54109(), Torture1_54109((DefaultOr54109(2.0, false) for i = 1:897)...))
942-
943892
bar54599() = Base.inferencebarrier(true) ? (Base.PkgId(Main),1) : nothing
944893

945894
function foo54599()

src/codegen.cpp

-141
Original file line numberDiff line numberDiff line change
@@ -3641,61 +3641,6 @@ static Value *emit_bitsunion_compare(jl_codectx_t &ctx, const jl_cgval_t &arg1,
36413641
return phi;
36423642
}
36433643

3644-
struct egal_desc {
3645-
size_t offset;
3646-
size_t nrepeats;
3647-
size_t data_bytes;
3648-
size_t padding_bytes;
3649-
};
3650-
3651-
template <typename callback>
3652-
static size_t emit_masked_bits_compare(callback &emit_desc, jl_datatype_t *aty, egal_desc &current_desc)
3653-
{
3654-
// Memcmp, but with masked padding
3655-
size_t data_bytes = 0;
3656-
size_t padding_bytes = 0;
3657-
size_t nfields = jl_datatype_nfields(aty);
3658-
size_t total_size = jl_datatype_size(aty);
3659-
assert(aty->layout->flags.isbitsegal);
3660-
for (size_t i = 0; i < nfields; ++i) {
3661-
size_t offset = jl_field_offset(aty, i);
3662-
size_t fend = i == nfields - 1 ? total_size : jl_field_offset(aty, i + 1);
3663-
size_t fsz = jl_field_size(aty, i);
3664-
jl_datatype_t *fty = (jl_datatype_t*)jl_field_type(aty, i);
3665-
assert(jl_is_datatype(fty)); // union fields should never reach here
3666-
assert(fty->layout->flags.isbitsegal);
3667-
if (jl_field_isptr(aty, i) || !fty->layout->flags.haspadding) {
3668-
// The field has no internal padding
3669-
data_bytes += fsz;
3670-
if (offset + fsz == fend) {
3671-
// The field has no padding after. Merge this into the current
3672-
// comparison range and go to next field.
3673-
} else {
3674-
padding_bytes = fend - offset - fsz;
3675-
// Found padding. Either merge this into the current comparison
3676-
// range, or emit the old one and start a new one.
3677-
if (current_desc.data_bytes == data_bytes &&
3678-
current_desc.padding_bytes == padding_bytes) {
3679-
// Same as the previous range, just note that down, so we
3680-
// emit this as a loop.
3681-
current_desc.nrepeats += 1;
3682-
} else {
3683-
if (current_desc.nrepeats != 0)
3684-
emit_desc(current_desc);
3685-
current_desc.nrepeats = 1;
3686-
current_desc.data_bytes = data_bytes;
3687-
current_desc.padding_bytes = padding_bytes;
3688-
}
3689-
data_bytes = 0;
3690-
}
3691-
} else {
3692-
// The field may have internal padding. Recurse this.
3693-
data_bytes += emit_masked_bits_compare(emit_desc, fty, current_desc);
3694-
}
3695-
}
3696-
return data_bytes;
3697-
}
3698-
36993644
static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t arg2)
37003645
{
37013646
++EmittedBitsCompares;
@@ -3772,92 +3717,6 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a
37723717
}
37733718
return ctx.builder.CreateICmpEQ(answer, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), 0));
37743719
}
3775-
else if (sz > 512 && jl_struct_try_layout(sty) && sty->layout->flags.isbitsegal) {
3776-
Value *varg1 = arg1.inline_roots.empty() && arg1.ispointer() ? data_pointer(ctx, arg1) :
3777-
value_to_pointer(ctx, arg1).V;
3778-
Value *varg2 = arg2.inline_roots.empty() && arg2.ispointer() ? data_pointer(ctx, arg2) :
3779-
value_to_pointer(ctx, arg2).V;
3780-
varg1 = emit_pointer_from_objref(ctx, varg1);
3781-
varg2 = emit_pointer_from_objref(ctx, varg2);
3782-
3783-
// See above for why we want to do this
3784-
SmallVector<Value*, 0> gc_uses;
3785-
gc_uses.append(get_gc_roots_for(ctx, arg1));
3786-
gc_uses.append(get_gc_roots_for(ctx, arg2));
3787-
OperandBundleDef OpBundle("jl_roots", gc_uses);
3788-
3789-
Value *answer = nullptr;
3790-
auto emit_desc = [&](egal_desc desc) {
3791-
Value *ptr1 = varg1;
3792-
Value *ptr2 = varg2;
3793-
if (desc.offset != 0) {
3794-
ptr1 = emit_ptrgep(ctx, ptr1, desc.offset);
3795-
ptr2 = emit_ptrgep(ctx, ptr2, desc.offset);
3796-
}
3797-
3798-
Value *new_ptr1 = ptr1;
3799-
Value *endptr1 = nullptr;
3800-
BasicBlock *postBB = nullptr;
3801-
BasicBlock *loopBB = nullptr;
3802-
PHINode *answerphi = nullptr;
3803-
if (desc.nrepeats != 1) {
3804-
// Set up loop
3805-
endptr1 = emit_ptrgep(ctx, ptr1, desc.nrepeats * (desc.data_bytes + desc.padding_bytes));;
3806-
3807-
BasicBlock *currBB = ctx.builder.GetInsertBlock();
3808-
loopBB = BasicBlock::Create(ctx.builder.getContext(), "egal_loop", ctx.f);
3809-
postBB = BasicBlock::Create(ctx.builder.getContext(), "post", ctx.f);
3810-
ctx.builder.CreateBr(loopBB);
3811-
3812-
ctx.builder.SetInsertPoint(loopBB);
3813-
Type *TInt1 = getInt1Ty(ctx.builder.getContext());
3814-
answerphi = ctx.builder.CreatePHI(TInt1, 2);
3815-
answerphi->addIncoming(answer ? answer : ConstantInt::get(TInt1, 1), currBB);
3816-
answer = answerphi;
3817-
3818-
PHINode *itr1 = ctx.builder.CreatePHI(ptr1->getType(), 2);
3819-
PHINode *itr2 = ctx.builder.CreatePHI(ptr2->getType(), 2);
3820-
3821-
new_ptr1 = emit_ptrgep(ctx, itr1, desc.data_bytes + desc.padding_bytes);
3822-
itr1->addIncoming(ptr1, currBB);
3823-
itr1->addIncoming(new_ptr1, loopBB);
3824-
3825-
Value *new_ptr2 = emit_ptrgep(ctx, itr2, desc.data_bytes + desc.padding_bytes);
3826-
itr2->addIncoming(ptr2, currBB);
3827-
itr2->addIncoming(new_ptr2, loopBB);
3828-
3829-
ptr1 = itr1;
3830-
ptr2 = itr2;
3831-
}
3832-
3833-
// Emit memcmp. TODO: LLVM has a pass to expand this for additional
3834-
// performance.
3835-
Value *this_answer = ctx.builder.CreateCall(prepare_call(memcmp_func),
3836-
{ ptr1,
3837-
ptr2,
3838-
ConstantInt::get(ctx.types().T_size, desc.data_bytes) },
3839-
ArrayRef<OperandBundleDef>(&OpBundle, gc_uses.empty() ? 0 : 1));
3840-
this_answer = ctx.builder.CreateICmpEQ(this_answer, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), 0));
3841-
answer = answer ? ctx.builder.CreateAnd(answer, this_answer) : this_answer;
3842-
if (endptr1) {
3843-
answerphi->addIncoming(answer, loopBB);
3844-
Value *loopend = ctx.builder.CreateICmpEQ(new_ptr1, endptr1);
3845-
ctx.builder.CreateCondBr(loopend, postBB, loopBB);
3846-
ctx.builder.SetInsertPoint(postBB);
3847-
}
3848-
};
3849-
egal_desc current_desc = {0};
3850-
size_t trailing_data_bytes = emit_masked_bits_compare(emit_desc, sty, current_desc);
3851-
assert(current_desc.nrepeats != 0);
3852-
emit_desc(current_desc);
3853-
if (trailing_data_bytes != 0) {
3854-
current_desc.nrepeats = 1;
3855-
current_desc.data_bytes = trailing_data_bytes;
3856-
current_desc.padding_bytes = 0;
3857-
emit_desc(current_desc);
3858-
}
3859-
return answer;
3860-
}
38613720
else {
38623721
jl_svec_t *types = sty->types;
38633722
Value *answer = ConstantInt::get(getInt1Ty(ctx.builder.getContext()), 1);

0 commit comments

Comments
 (0)