fix: enable new reactor in the e2e test #1657
base: feature/recovery
Conversation
Need to debug the e2e test after merging.
Currently, proofs are not sent during catchup. This is fine if the node downloads the compact block; however, when the node doesn't download the compact block, it cannot reconstruct the proofs itself. This occurs when the consensus reactor reconstructs the PartSetHeader from the commit and the node catches up. To fix this, we need to optionally send proofs in the recovery part and add the ability to ask for proofs in the want message, at least for now while we need to remain backwards compatible.
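
A minimal, self-contained sketch of that idea, using simplified stand-in types rather than the real proptypes definitions (all field and function names here are assumptions for illustration only):

package main

import "fmt"

// wantParts is a stand-in for the want message, extended with a Prove flag
// so a catching-up node can ask the sender to attach proofs.
type wantParts struct {
	Height int64
	Round  int32
	Parts  []uint32
	Prove  bool
}

// recoveryPart is a stand-in for a recovery part; Proof is only populated
// when the requester asked for it.
type recoveryPart struct {
	Index uint32
	Data  []byte
	Proof []byte
}

// buildRecoveryParts attaches proofs only when the want asked for them,
// i.e. during catchup when the requester never downloaded the compact
// block and cannot reconstruct the proofs itself.
func buildRecoveryParts(w wantParts, parts, proofs map[uint32][]byte) []recoveryPart {
	out := make([]recoveryPart, 0, len(w.Parts))
	for _, idx := range w.Parts {
		rp := recoveryPart{Index: idx, Data: parts[idx]}
		if w.Prove {
			rp.Proof = proofs[idx]
		}
		out = append(out, rp)
	}
	return out
}

func main() {
	parts := map[uint32][]byte{0: {0x01}, 1: {0x02}}
	proofs := map[uint32][]byte{0: {0xaa}, 1: {0xbb}}
	w := wantParts{Height: 10, Round: 0, Parts: []uint32{0, 1}, Prove: true}
	for _, rp := range buildRecoveryParts(w, parts, proofs) {
		fmt.Printf("part %d: proof attached=%v\n", rp.Index, rp.Proof != nil)
	}
}
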
missing := prop.block.BitArray().Not()
if missing.IsEmpty() {
	// this should never be hit due to the check above.
	continue
}
missingParts := partSet.BitArray().Not()
wantPart := &proptypes.WantParts{
	Parts:  missingParts,
	Height: targetHeight,
	Round:  round,

// make requests from different peers
peers = shuffle(peers)

for _, peer := range peers {
	mc := missing.Copy()
	reqs, has := peer.GetRequests(height, round)
	if has {
		mc = mc.Sub(reqs)
	}

	if mc.IsEmpty() {
		continue
	}

	e := p2p.Envelope{
		ChannelID: WantChannel,
		Message: &protoprop.WantParts{
			Parts:  *missing.ToProto(),
			Height: height,
			Round:  round,
			Prove:  true,
		},
	}

	if !p2p.TrySendEnvelopeShim(peer.peer, e, blockProp.Logger) { //nolint:staticcheck
		blockProp.Logger.Error("failed to send want part", "peer", peer, "height", height, "round", round)
		continue
	}

	// keep track of which requests we've made this attempt.
	missing.Sub(mc)
	peer.AddRequests(height, round, missing)
Note that we need to test this function in a unit test and ensure that we're only actually max-requesting parts once, in a follow-up.
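
A rough sketch of the invariant such a unit test could assert, with placeholder data standing in for state read back from the reactor's peers (the package name and helper here are hypothetical, not part of the PR):

package propagation_test

import "testing"

// countRequests tallies how many peers were asked for each part index. In a
// real test, the per-peer part lists would come from driving the retry path
// and reading back peer.GetRequests.
func countRequests(perPeer map[string][]int) map[int]int {
	counts := make(map[int]int)
	for _, parts := range perPeer {
		for _, idx := range parts {
			counts[idx]++
		}
	}
	return counts
}

func TestPartsRequestedAtMostOnce(t *testing.T) {
	const reqLimit = 1 // mirrors reqLimit in this diff
	perPeer := map[string][]int{ // placeholder data
		"peerA": {0, 1},
		"peerB": {2, 3},
	}
	for idx, n := range countRequests(perPeer) {
		if n > reqLimit {
			t.Fatalf("part %d requested %d times, limit is %d", idx, n, reqLimit)
		}
	}
}
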
func (blockProp *Reactor) AddCommitment(height int64, round int32, psh *types.PartSetHeader) {
	blockProp.pmtx.Lock()
	defer blockProp.pmtx.Unlock()

	if blockProp.proposals[height] == nil {
		blockProp.proposals[height] = make(map[int32]*proposalData)
	}

	combinedSet := proptypes.NewCombinedPartSetFromOriginal(types.NewPartSetFromHeader(*psh), true)

	if blockProp.proposals[height][round] != nil {
		return
	}

	blockProp.proposals[height][round] = &proposalData{
		compactBlock: &proptypes.CompactBlock{
			Proposal: types.Proposal{
				Height: height,
				Round:  round,
			},
		},
		catchup:     true,
		block:       combinedSet,
		maxRequests: bits.NewBitArray(int(psh.Total * 2)), // this assumes that the parity parts are the same size
	}
}
This allows the consensus reactor to add a part set header each time the node falls behind and then sees a valid commit later.
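
A minimal sketch of the intended call site, assuming the fork keeps the upstream github.com/tendermint/tendermint/types import path and that the consensus reactor holds a handle to the propagation reactor (modeled here as a small interface); both are assumptions for illustration, not code from the PR:

package consensus

import "github.com/tendermint/tendermint/types"

// commitmentAdder is a stand-in for the propagation Reactor's AddCommitment method.
type commitmentAdder interface {
	AddCommitment(height int64, round int32, psh *types.PartSetHeader)
}

// onSeenCommit registers the part set header from a later commit so the
// propagation reactor can catch up on a height the node fell behind on.
func onSeenCommit(p commitmentAdder, commit *types.Commit) {
	psh := commit.BlockID.PartSetHeader
	p.AddCommitment(commit.Height, commit.Round, &psh)
}
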
// delete all but the last round for each remaining height except the current.
// this is because we need to keep the last round for the current height.
for height := range p.proposals {
	if height == p.currentHeight {
		continue
	}
	for round := range p.proposals[height] {
		if round <= p.currentRound-int32(keepRecentRounds) {
			delete(p.proposals[height], round)
		}
	}
}
Simplified pruning by just pruning everything after the consensus reactor commits to a block (instead of pruning after we receive new compact blocks). This makes pruning rounds more difficult, though.
-	round: -1, // meaning "latest round"
+	round: -2, // meaning "latest round"
I think we don't need this
-func (blockProp *Reactor) handleHaves(peer p2p.ID, haves *proptypes.HaveParts, bypassRequestLimit bool) {
+func (blockProp *Reactor) handleHaves(peer p2p.ID, haves *proptypes.HaveParts, _ bool) {
note that we should remove the unused arg
-	p.wants.AddBitArray(haves)
+	p.haves.AddBitArray(haves)
I don't even know how this was working at all before.
Super annoying bug, omg.
// TestHugeBlock doesn't have a success or failure condition yet, although one could be added. It is very useful for debugging however
func TestHugeBlock(t *testing.T) {
	p2pCfg := cfg.DefaultP2PConfig()
	p2pCfg.SendRate = 5000000
	p2pCfg.RecvRate = 5000000

	nodes := 20

	reactors, _ := createTestReactors(nodes, p2pCfg, false, "/home/evan/data/experiments/celestia/fast-recovery/debug")

	cleanup, _, sm := state.SetupTestCase(t)
	t.Cleanup(func() {
		cleanup(t)
	})

	prop, ps, _, metaData := createTestProposal(sm, 1, 32, 1000000)

	reactors[1].ProposeBlock(prop, ps, metaData)
}
This is only for fun / to help with debugging. We should expand on it in the future to actually check invariants.
-stateMachineValidBlock, err := cs.blockExec.ProcessProposal(cs.ProposalBlock)
-if err != nil {
-	cs.Logger.Error("state machine returned an error when trying to process proposal block", "err", err)
-	return
-}
+// todo: re-enable after the fast testnet
+// stateMachineValidBlock, err := cs.blockExec.ProcessProposal(cs.ProposalBlock)
+// if err != nil {
+//	cs.Logger.Error("state machine returned an error when trying to process proposal block", "err", err)
+//	return
+// }
+stateMachineValidBlock := true
note that we should re-enable this after the mammoth testnet
// todo: we need to figure out a way to get the proof for a part that was
// sent during catchup.
proof := cb.GetProof(part.Index)
if proof == nil {
	if part.Proof == nil {
		blockProp.Logger.Error("proof not found", "peer", peer, "height", part.Height, "round", part.Round, "part", part.Index)
		return
	}
	proof = part.Proof
}
Now we get proofs that are stored in the compact block instead of including them in messages; proofs are only included during catchup.
// todo: add a request limit for each part to avoid downloading the block too
// many times. atm, this code will request the same part from every peer.
func (blockProp *Reactor) retryWants(currentHeight int64, currentRound int32) {
	data := blockProp.dumpAll()
Hmm, so this supports gaps, right?
// only re-request original parts that are missing, not parity parts.
missing := prop.block.BitArray().Not()
if missing.IsEmpty() {
	// this should never be hit due to the check above.
maybe log something here since it's never supposed to happen
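
For example, something like the following inside that branch (only the message string is suggested; whether extra key/value fields fit depends on what is in scope at that point in retryWants):

if missing.IsEmpty() {
	// this should never be hit due to the check above.
	blockProp.Logger.Error("incomplete block but no missing original parts")
	continue
}
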
// TODO document and explain the parameters
// chunkIndexes creates a nested slice of starting and ending indexes for each
// chunk. totalSize indicates the number of chunks. chunkSize indicates the size
// of each chunk..
Suggested change:
-// of each chunk..
+// of each chunk.
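
For reference, a self-contained sketch of what such a helper might look like, based only on this doc comment; the actual signature and return type in the PR may differ:

package main

import "fmt"

// chunkIndexes returns [start, end) index pairs that split totalSize items
// into chunks of at most chunkSize items each.
func chunkIndexes(totalSize, chunkSize int) [][2]int {
	if totalSize <= 0 || chunkSize <= 0 {
		return nil
	}
	out := make([][2]int, 0, (totalSize+chunkSize-1)/chunkSize)
	for start := 0; start < totalSize; start += chunkSize {
		end := start + chunkSize
		if end > totalSize {
			end = totalSize
		}
		out = append(out, [2]int{start, end})
	}
	return out
}

func main() {
	fmt.Println(chunkIndexes(10, 4)) // [[0 4] [4 8] [8 10]]
}
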
var hasStored *types.BlockMeta
if height < p.currentHeight {
	hasStored = p.store.LoadBlockMeta(height)
}

cachedProps, has := p.proposals[height]
cachedProp, hasRound := cachedProps[round]

// if the round is less than zero, then they're asking for the latest
Suggested change:
-// if the round is less than zero, then they're asking for the latest
+// if the round is less than -1, then they're asking for the latest
// prune deletes all cached compact blocks for heights less than the provided
// height and round.
//
// todo: also prune rounds. this requires prune in the consensus reactor after
This is pruning rounds too, no? Since we delete all the proposals for that round.
hc.Sub(fullReqs)

if hc.IsEmpty() {
	return
This also shouldn't happen since we're checking if the block is complete; maybe write some log.
}

reqLimit := 1
so in the happy path, we only send wants once?
@@ -106,7 +104,7 @@ func (blockProp *Reactor) handleHaves(peer p2p.ID, haves *proptypes.HaveParts, b
 		},
 	}

-	if !p2p.SendEnvelopeShim(p.peer, e, blockProp.Logger) { //nolint:staticcheck
+	if !p2p.TrySendEnvelopeShim(p.peer, e, blockProp.Logger) { //nolint:staticcheck
Since we're not making sure the message is sent, does it make sense to increase the request limit to 2 for additional redundancy?
@@ -17,8 +17,7 @@ import (
 )

 const (
-	// TODO: set a valid max msg size
-	maxMsgSize = 1048576
+	maxMsgSize = 4194304 // 4MiB
Any rationale for 4 MiB?
// blockCacheSize determines the number of blocks to keep in the cache.
// After each block is committed, only the last `blockCacheSize` blocks are
// kept.
blockCacheSize = 5
IMO, this should be increased. Assuming 128 MB blocks, we can expect this to hold around 800 MB of RAM (including proofs and other stuff), which is still small.
The issue is the ~3 s block time: this only keeps the compact blocks from the last ~15 seconds. If a node falls behind for more than 15 seconds (which I think is quite likely to happen), will block sync be started automatically? I don't think that's the case, and this will leave the node hanging if all peers prune.
Maybe increase it to 25, which would keep ~4 GB of RAM, but that's fine IMO.
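
A quick back-of-the-envelope check of those numbers, using the block size and block time assumed in the comment above:

package main

import "fmt"

func main() {
	const (
		blockSizeMB  = 128 // assumed worst-case block size
		blockTimeSec = 3   // assumed block time
	)
	for _, cacheSize := range []int{5, 25} {
		fmt.Printf("blockCacheSize=%d: ~%d MB of block data (more with proofs and overhead), covers ~%d s of blocks\n",
			cacheSize, cacheSize*blockSizeMB, cacheSize*blockTimeSec)
	}
}
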
Description
Adds catchup and proof verification to fix the e2e test. Still has some unit testing gaps, though.