@@ -3358,6 +3358,58 @@ static Value *emit_bitsunion_compare(jl_codectx_t &ctx, const jl_cgval_t &arg1,
3358
3358
return phi;
3359
3359
}
3360
3360
3361
// Describes one run of bytes to compare with memcmp while skipping padding.
// The run is a fixed pattern (`data_bytes` of payload followed by
// `padding_bytes` that are ignored) repeated `nrepeats` times.
struct egal_desc {
    // Byte offset, from the start of the value, at which the first repetition begins.
    size_t offset;
    // Number of consecutive repetitions of the pattern; 0 means no pattern
    // has been recorded yet.
    size_t nrepeats;
    // Bytes compared in each repetition.
    size_t data_bytes;
    // Bytes skipped after the data in each repetition, so the stride between
    // repetitions is data_bytes + padding_bytes.
    size_t padding_bytes;
};
3367
+
3368
+ template <typename callback>
3369
+ static size_t emit_masked_bits_compare(callback &emit_desc, jl_datatype_t *aty, egal_desc ¤t_desc)
3370
+ {
3371
+ // Memcmp, but with masked padding
3372
+ size_t data_bytes = 0;
3373
+ size_t padding_bytes = 0;
3374
+ size_t nfields = jl_datatype_nfields(aty);
3375
+ size_t total_size = jl_datatype_size(aty);
3376
+ for (size_t i = 0; i < nfields; ++i) {
3377
+ size_t offset = jl_field_offset(aty, i);
3378
+ size_t fend = i == nfields - 1 ? total_size : jl_field_offset(aty, i + 1);
3379
+ size_t fsz = jl_field_size(aty, i);
3380
+ jl_datatype_t *fty = (jl_datatype_t*)jl_field_type(aty, i);
3381
+ if (jl_field_isptr(aty, i) || !fty->layout->flags.haspadding) {
3382
+ // The field has no internal padding
3383
+ data_bytes += fsz;
3384
+ if (offset + fsz == fend) {
3385
+ // The field has no padding after. Merge this into the current
3386
+ // comparison range and go to next field.
3387
+ } else {
3388
+ padding_bytes = fend - offset - fsz;
3389
+ // Found padding. Either merge this into the current comparison
3390
+ // range, or emit the old one and start a new one.
3391
+ if (current_desc.data_bytes == data_bytes &&
3392
+ current_desc.padding_bytes == padding_bytes) {
3393
+ // Same as the previous range, just note that down, so we
3394
+ // emit this as a loop.
3395
+ current_desc.nrepeats += 1;
3396
+ } else {
3397
+ if (current_desc.nrepeats != 0)
3398
+ emit_desc(current_desc);
3399
+ current_desc.nrepeats = 1;
3400
+ current_desc.data_bytes = data_bytes;
3401
+ current_desc.padding_bytes = padding_bytes;
3402
+ }
3403
+ data_bytes = 0;
3404
+ }
3405
+ } else {
3406
+ // The field may have internal padding. Recurse this.
3407
+ data_bytes += emit_masked_bits_compare(emit_desc, fty, current_desc);
3408
+ }
3409
+ }
3410
+ return data_bytes;
3411
+ }
3412
+
3361
3413
static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t arg2)
3362
3414
{
3363
3415
++EmittedBitsCompares;
@@ -3396,7 +3448,7 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a
3396
3448
if (at->isAggregateType()) { // Struct or Array
3397
3449
jl_datatype_t *sty = (jl_datatype_t*)arg1.typ;
3398
3450
size_t sz = jl_datatype_size(sty);
3399
- if (sz > 512 && !sty->layout->flags.haspadding) {
3451
+ if (sz > 512 && !sty->layout->flags.haspadding && sty->layout->flags.isbitsegal ) {
3400
3452
Value *varg1 = arg1.ispointer() ? data_pointer(ctx, arg1) :
3401
3453
value_to_pointer(ctx, arg1).V;
3402
3454
Value *varg2 = arg2.ispointer() ? data_pointer(ctx, arg2) :
@@ -3433,6 +3485,89 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a
3433
3485
}
3434
3486
return ctx.builder.CreateICmpEQ(answer, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), 0));
3435
3487
}
3488
+ else if (sz > 512 && jl_struct_try_layout(sty) && sty->layout->flags.isbitsegal) {
3489
+ Type *TInt8 = getInt8Ty(ctx.builder.getContext());
3490
+ Type *TpInt8 = getInt8PtrTy(ctx.builder.getContext());
3491
+ Type *TInt1 = getInt1Ty(ctx.builder.getContext());
3492
+ Value *varg1 = arg1.ispointer() ? data_pointer(ctx, arg1) :
3493
+ value_to_pointer(ctx, arg1).V;
3494
+ Value *varg2 = arg2.ispointer() ? data_pointer(ctx, arg2) :
3495
+ value_to_pointer(ctx, arg2).V;
3496
+ varg1 = emit_pointer_from_objref(ctx, varg1);
3497
+ varg2 = emit_pointer_from_objref(ctx, varg2);
3498
+ varg1 = emit_bitcast(ctx, varg1, TpInt8);
3499
+ varg2 = emit_bitcast(ctx, varg2, TpInt8);
3500
+
3501
+ Value *answer = nullptr;
3502
+ auto emit_desc = [&](egal_desc desc) {
3503
+ Value *ptr1 = varg1;
3504
+ Value *ptr2 = varg2;
3505
+ if (desc.offset != 0) {
3506
+ ptr1 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, ptr1, desc.offset);
3507
+ ptr2 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, ptr2, desc.offset);
3508
+ }
3509
+
3510
+ Value *new_ptr1 = ptr1;
3511
+ Value *endptr1 = nullptr;
3512
+ BasicBlock *postBB = nullptr;
3513
+ BasicBlock *loopBB = nullptr;
3514
+ PHINode *answerphi = nullptr;
3515
+ if (desc.nrepeats != 1) {
3516
+ // Set up loop
3517
+ endptr1 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, ptr1, desc.nrepeats * (desc.data_bytes + desc.padding_bytes));;
3518
+
3519
+ BasicBlock *currBB = ctx.builder.GetInsertBlock();
3520
+ loopBB = BasicBlock::Create(ctx.builder.getContext(), "egal_loop", ctx.f);
3521
+ postBB = BasicBlock::Create(ctx.builder.getContext(), "post", ctx.f);
3522
+ ctx.builder.CreateBr(loopBB);
3523
+
3524
+ ctx.builder.SetInsertPoint(loopBB);
3525
+ answerphi = ctx.builder.CreatePHI(TInt1, 2);
3526
+ answerphi->addIncoming(answer ? answer : ConstantInt::get(TInt1, 1), currBB);
3527
+ answer = answerphi;
3528
+
3529
+ PHINode *itr1 = ctx.builder.CreatePHI(ptr1->getType(), 2);
3530
+ PHINode *itr2 = ctx.builder.CreatePHI(ptr2->getType(), 2);
3531
+
3532
+ new_ptr1 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, itr1, desc.data_bytes + desc.padding_bytes);
3533
+ itr1->addIncoming(ptr1, currBB);
3534
+ itr1->addIncoming(new_ptr1, loopBB);
3535
+
3536
+ Value *new_ptr2 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, itr2, desc.data_bytes + desc.padding_bytes);
3537
+ itr2->addIncoming(ptr2, currBB);
3538
+ itr2->addIncoming(new_ptr2, loopBB);
3539
+
3540
+ ptr1 = itr1;
3541
+ ptr2 = itr2;
3542
+ }
3543
+
3544
+ // Emit memcmp. TODO: LLVM has a pass to expand this for additional
3545
+ // performance.
3546
+ Value *this_answer = ctx.builder.CreateCall(prepare_call(memcmp_func),
3547
+ { ptr1,
3548
+ ptr2,
3549
+ ConstantInt::get(ctx.types().T_size, desc.data_bytes) });
3550
+ this_answer = ctx.builder.CreateICmpEQ(this_answer, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), 0));
3551
+ answer = answer ? ctx.builder.CreateAnd(answer, this_answer) : this_answer;
3552
+ if (endptr1) {
3553
+ answerphi->addIncoming(answer, loopBB);
3554
+ Value *loopend = ctx.builder.CreateICmpEQ(new_ptr1, endptr1);
3555
+ ctx.builder.CreateCondBr(loopend, postBB, loopBB);
3556
+ ctx.builder.SetInsertPoint(postBB);
3557
+ }
3558
+ };
3559
+ egal_desc current_desc = {0};
3560
+ size_t trailing_data_bytes = emit_masked_bits_compare(emit_desc, sty, current_desc);
3561
+ assert(current_desc.nrepeats != 0);
3562
+ emit_desc(current_desc);
3563
+ if (trailing_data_bytes != 0) {
3564
+ current_desc.nrepeats = 1;
3565
+ current_desc.data_bytes = trailing_data_bytes;
3566
+ current_desc.padding_bytes = 0;
3567
+ emit_desc(current_desc);
3568
+ }
3569
+ return answer;
3570
+ }
3436
3571
else {
3437
3572
jl_svec_t *types = sty->types;
3438
3573
Value *answer = ConstantInt::get(getInt1Ty(ctx.builder.getContext()), 1);
0 commit comments