From 9a9370acd07b92f9175ea16b6fd4434240ffa8fe Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 24 Dec 2024 13:29:13 +0800 Subject: [PATCH 001/119] feat: added new ExternalMessage::RequestFromHeight and ExternalMessage::RequestFromHash message types. --- zilliqa/src/message.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 25d33d897..65f703a49 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -209,7 +209,7 @@ pub struct BlockRequest { pub to_view: u64, } -#[derive(Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct BlockResponse { pub proposals: Vec, pub from_view: u64, @@ -227,6 +227,13 @@ impl fmt::Debug for BlockResponse { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RequestBlock { + pub from_height: u64, + pub from_hash: Hash, + pub batch_size: u64, +} + /// Used to convey proposal processing internally, to avoid blocking threads for too long. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProcessProposal { @@ -259,6 +266,8 @@ pub enum ExternalMessage { /// An acknowledgement of the receipt of a message. Note this is only used as a response when the caller doesn't /// require any data in the response. Acknowledgement, + RequestFromHeight(RequestBlock), + RequestFromHash(RequestBlock), } impl ExternalMessage { @@ -274,6 +283,16 @@ impl ExternalMessage { impl Display for ExternalMessage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { + ExternalMessage::RequestFromHeight(r) => { + write!( + f, + "RequestFromHeight({}, num={})", + r.from_height, r.batch_size + ) + } + ExternalMessage::RequestFromHash(r) => { + write!(f, "RequestFromHash({}, num={})", r.from_hash, r.batch_size) + } ExternalMessage::Proposal(p) => write!(f, "Proposal({})", p.view()), ExternalMessage::Vote(v) => write!(f, "Vote({})", v.view), ExternalMessage::NewView(n) => write!(f, "NewView({})", n.view), From ef104fe431a515f8e904e7b79b38468f8a7fc562 Mon Sep 17 00:00:00 2001 From: Shawn Date: Wed, 25 Dec 2024 16:35:41 +0800 Subject: [PATCH 002/119] feat: initial blockstore.rs skeleton. --- zilliqa/src/message.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 65f703a49..865574a00 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -234,6 +234,11 @@ pub struct RequestBlock { pub batch_size: u64, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResponseBlock { + pub proposals: Vec, +} + /// Used to convey proposal processing internally, to avoid blocking threads for too long. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProcessProposal { @@ -268,6 +273,8 @@ pub enum ExternalMessage { Acknowledgement, RequestFromHeight(RequestBlock), RequestFromHash(RequestBlock), + ResponseFromHeight(ResponseBlock), + ResponseFromHash(ResponseBlock), } impl ExternalMessage { @@ -283,6 +290,12 @@ impl ExternalMessage { impl Display for ExternalMessage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { + ExternalMessage::ResponseFromHeight(r) => { + write!(f, "ResponseFromHeight({})", r.proposals.len()) + } + ExternalMessage::ResponseFromHash(r) => { + write!(f, "ResponseFromHash({})", r.proposals.len()) + } ExternalMessage::RequestFromHeight(r) => { write!( f, From 24225618a01727f846e8217d950a4f0d42b31beb Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 26 Dec 2024 15:10:25 +0800 Subject: [PATCH 003/119] feat: added request/response skeleton. --- zilliqa/src/message.rs | 6 +- zilliqa/src/node.rs | 127 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 4 deletions(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 865574a00..9a306f3a2 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -229,9 +229,9 @@ impl fmt::Debug for BlockResponse { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RequestBlock { - pub from_height: u64, + pub from_number: u64, pub from_hash: Hash, - pub batch_size: u64, + pub batch_size: usize, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -300,7 +300,7 @@ impl Display for ExternalMessage { write!( f, "RequestFromHeight({}, num={})", - r.from_height, r.batch_size + r.from_number, r.batch_size ) } ExternalMessage::RequestFromHash(r) => { diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 5eb0db947..956346c2f 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -34,7 +34,7 @@ use crate::{ inspector::{self, ScillaInspector}, message::{ Block, BlockHeader, BlockResponse, ExternalMessage, InternalMessage, IntershardCall, - ProcessProposal, Proposal, + ProcessProposal, Proposal, ResponseBlock, }, node_launcher::ResponseChannel, p2p_node::{LocalMessageTuple, OutboundMessageTuple}, @@ -269,6 +269,131 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } + ExternalMessage::RequestFromHeight(request) => { + if from == self.peer_id { + warn!("block_store::RequestFromHeight : ignoring blocks request to self"); + return Ok(()); + } + + // TODO: Check if we should service this request. + // Validators shall not respond to this request. + + trace!( + "block_store::RequestFromHeight : received a block request - {}", + self.peer_id + ); + + // TODO: Replace this entire block with a single SQL query + let Some(alpha) = self.db.get_block_by_hash(&request.from_hash)? else { + // We do not have the starting block + self.request_responses.send(( + response_channel, + ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }), + ))?; + return Ok(()); + }; + let mut proposals = Vec::new(); + for num in alpha.number().saturating_add(1) + ..=alpha.number().saturating_add(request.batch_size as u64) + { + let Some(block) = self.db.get_canonical_block_by_number(num)? else { + // that's all we have! 
+ break; + }; + proposals.push(self.block_to_proposal(block)); + } + + self.request_responses.send(( + response_channel, + ExternalMessage::ResponseFromHash(ResponseBlock { proposals }), + ))?; + } + ExternalMessage::ResponseFromHeight(response) => { + // Check that we have enough to complete the process, otherwise ignore + if response.proposals.is_empty() { + // Empty response, downgrade peer + warn!("block_store::ResponseFromHeight : empty blocks in flight {from}",); + } + if response.proposals.len() < self.config.max_blocks_in_flight as usize { + // Partial response, downgrade peer + warn!("block_store::ResponseFromHeight : insufficient blocks in flight {from}",); + } + + // TODO: Inject proposals + debug!( + "block_store::ResponseFromHeight : injecting proposals {:?}", + response + ); + + // Acknowledge this block response. This does nothing because the `BlockResponse` request was sent by + // us, but we keep it here for symmetry with the other handlers. + self.request_responses + .send((response_channel, ExternalMessage::Acknowledgement))?; + } + ExternalMessage::RequestFromHash(request) => { + if from == self.peer_id { + warn!("block_store::RequestFromHash : ignoring blocks request to self"); + return Ok(()); + } + + trace!( + "block_store::RequestFromHash : received a block request - {}", + self.peer_id + ); + + // TODO: Check if we should service this request + // Validators could respond to this request if there is nothing else to do. + + let Some(omega_block) = self.db.get_block_by_hash(&request.from_hash)? else { + // We do not have the starting block + self.request_responses.send(( + response_channel, + ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }), + ))?; + return Ok(()); + }; + + let mut proposals = Vec::new(); + let mut hash = omega_block.parent_hash(); + // grab up to batch_size blocks + while proposals.len() < request.batch_size { + // grab the parent + let Some(block) = self.db.get_block_by_hash(&hash)? else { + // that's all we have! + break; + }; + hash = block.parent_hash(); + proposals.push(self.block_to_proposal(block)); + } + + self.request_responses.send(( + response_channel, + ExternalMessage::ResponseFromHash(ResponseBlock { proposals }), + ))?; + } + ExternalMessage::ResponseFromHash(response) => { + // Check that we have enough to complete the process, otherwise ignore + if response.proposals.is_empty() { + // Empty response, downgrade peer + warn!("block_store::ResponseFromHeight : empty blocks in flight {from}",); + } + // Check that we have enough to complete the process, otherwise ignore + if response.proposals.len() * 2 < self.config.max_blocks_in_flight as usize { + warn!("block_store::ResponseFromHash : insufficient blocks in flight {from}",); + return Ok(()); + } + + // TODO: Inject proposals + debug!( + "block_store::ResponseFromHash : injecting proposals {:?}", + response + ); + + // Acknowledge this block response. This does nothing because the `BlockResponse` request was sent by + // us, but we keep it here for symmetry with the other handlers. + self.request_responses + .send((response_channel, ExternalMessage::Acknowledgement))?; + } ExternalMessage::BlockRequest(request) => { if from == self.peer_id { debug!("block_store::BlockRequest : ignoring blocks request to self"); From f811b776d9c9c8e7689f72895b570907e3983da8 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 26 Dec 2024 15:43:11 +0800 Subject: [PATCH 004/119] feat: hook up initial wiring of blockstore with consensus. 
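The new module is imported under an alias (`use crate::blockstore::BlockStore as BlockStore2`) so it can sit alongside the existing `block_store::BlockStore` during the transition. `Consensus` gains a `blockstore` field, built from the same config, database handle and message sender already passed to the legacy store.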
--- zilliqa/src/consensus.rs | 5 +++++ zilliqa/src/lib.rs | 1 + 2 files changed, 6 insertions(+) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index c5a93c761..efa18ed8a 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -22,6 +22,7 @@ use tracing::*; use crate::{ block_store::BlockStore, blockhooks, + blockstore::BlockStore as BlockStore2, cfg::{ConsensusConfig, NodeConfig}, constants::TIME_TO_ALLOW_PROPOSAL_BROADCAST, contracts, @@ -151,6 +152,7 @@ pub struct Consensus { config: NodeConfig, message_sender: MessageSender, reset_timeout: UnboundedSender, + blockstore: BlockStore2, pub block_store: BlockStore, latest_leader_cache: RefCell>, votes: BTreeMap, @@ -206,6 +208,8 @@ impl Consensus { )?; } + let blockstore = BlockStore2::new(&config, db.clone(), message_sender.clone())?; + // It is important to create the `BlockStore` after the checkpoint has been loaded into the DB. The // `BlockStore` pre-loads and caches information about the currently stored blocks. let block_store = BlockStore::new(&config, db.clone(), message_sender.clone())?; @@ -324,6 +328,7 @@ impl Consensus { let mut consensus = Consensus { secret_key, config, + blockstore, block_store, latest_leader_cache: RefCell::new(None), message_sender, diff --git a/zilliqa/src/lib.rs b/zilliqa/src/lib.rs index b949e6493..642e82df2 100644 --- a/zilliqa/src/lib.rs +++ b/zilliqa/src/lib.rs @@ -27,3 +27,4 @@ pub mod test_util; pub mod time; pub mod transaction; pub mod zq1_proto; +pub mod blockstore; From 87c12f9afc81dba7378c671c0733fc51a3b2c73f Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 26 Dec 2024 15:49:27 +0800 Subject: [PATCH 005/119] feat: added blockstore.rs. --- zilliqa/src/blockstore.rs | 203 ++++++++++++++++++++++++++++++++++++++ zilliqa/src/consensus.rs | 2 +- 2 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 zilliqa/src/blockstore.rs diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs new file mode 100644 index 000000000..7b648d1de --- /dev/null +++ b/zilliqa/src/blockstore.rs @@ -0,0 +1,203 @@ +use std::{ + cmp::Ordering, + collections::BinaryHeap, + sync::Arc, + time::{Duration, Instant}, +}; + +use anyhow::Result; + +use libp2p::PeerId; + +use crate::{ + cfg::NodeConfig, + db::Db, + message::{Block, ExternalMessage, Proposal, RequestBlock}, + node::{MessageSender, RequestId}, +}; + +/// Stores and manages the node's list of blocks. Also responsible for making requests for new blocks. +/// +/// # Syncing Algorithm +/// +/// We rely on [crate::consensus::Consensus] informing us of newly received block proposals via: +/// * [BlockStore::process_block] for blocks that can be part of our chain, because we already have their parent. +/// * [BlockStore::buffer_proposal] for blocks that can't (yet) be part of our chain. +/// +/// Both these code paths also call [BlockStore::request_missing_blocks]. This finds the greatest view of any proposal +/// we've seen (whether its part of our chain or not). 
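+///
+/// Requests go out in batches of at most `max_blocks_in_flight` blocks. Roughly:
+/// * if the gap to the new block is more than half a batch, we send `RequestFromHeight`
+///   for our highest canonical block and the responder returns the blocks that follow it;
+/// * otherwise we send `RequestFromHash` for the new block and the responder walks
+///   backwards from its parent.
+///
+/// Worked example (illustrative numbers): with `max_blocks_in_flight = 1000`, a node whose
+/// highest canonical block is 5000 and which sees a proposal at height 9000 has a gap of
+/// 4000 > 500, so it requests forwards; if the proposal were at height 5100 instead, the
+/// gap of 100 <= 500 would make it request backwards from the proposal's parent.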
+ +#[derive(Debug)] +pub struct BlockStore { + // database + db: Arc, + // message bus + message_sender: MessageSender, + // internal peers + peers: BinaryHeap, + // in-flight + in_flight: Option, + // in-flight timeout + request_timeout: Duration, + // how many blocks to request at once + max_blocks_in_flight: usize, +} + +impl BlockStore { + pub fn new( + config: &NodeConfig, + db: Arc, + message_sender: MessageSender, + peers: Vec, + ) -> Result { + let peers = peers + .into_iter() + .map(|peer_id| PeerInfo { + score: 0, + peer_id, + last_used: Instant::now(), + }) + .collect(); + + Ok(Self { + db, + message_sender, + peers, + in_flight: None, + request_timeout: config.consensus.consensus_timeout, + max_blocks_in_flight: config.max_blocks_in_flight.max(31) as usize, // between 30 seconds and 3 days of blocks. + }) + } + + /// Route each proposal as if it were received. + pub fn handle_response_from_height(&mut self, proposals: Vec) -> Result<()> { + // Just pump the Proposals back to ourselves, and it will be picked up and processed as if it were received. + // Only issue is the timestamp skew. We should probably fix that. + for p in proposals { + tracing::trace!("Received proposal from height: {:?}", p); + self.message_sender.send_external_message( + self.message_sender.our_peer_id, + ExternalMessage::Proposal(p), + )?; + } + Ok(()) + } + + pub fn handle_from_hash(&mut self, _: Vec) -> Result<()> { + // ... + Ok(()) + } + + pub fn process_proposal(&self, block: Block) -> Result<()> { + // ... + // check if block parent exists + let parent_block = self.db.get_block_by_hash(&block.parent_hash())?; + + // no parent block, trigger sync + if parent_block.is_none() {} + Ok(()) + } + + pub fn buffer_proposal(&self, block: Block) { + // ... + } + + /// Request blocks between the current height and the given block. + /// + /// The approach is to request blocks in batches of `max_blocks_in_flight` blocks. + /// If the block gap is large, we request blocks from the last known canonical block forwards. + /// If the block gap is small, we request blocks from the latest block backwards. + /// + pub fn request_missing_blocks(&mut self, omega_block: Block) -> Result { + // highest canonical block we have + // TODO: Replace this with a single SQL query. + let height = self + .db + .get_highest_canonical_block_number()? + .unwrap_or_default(); + let alpha_block = self.db.get_canonical_block_by_number(height)?.unwrap(); + + // Compute the block gap. + let block_gap = omega_block + .header + .number + .saturating_sub(alpha_block.header.number); + + // TODO: Double-check computation + let message = if block_gap > self.max_blocks_in_flight as u64 / 2 { + // we're far from latest block + ExternalMessage::RequestFromHeight(RequestBlock { + from_number: alpha_block.header.number, + from_hash: alpha_block.header.hash, + batch_size: self.max_blocks_in_flight, + }) + } else { + // we're close to latest block + ExternalMessage::RequestFromHash(RequestBlock { + from_number: omega_block.header.number, + from_hash: omega_block.header.hash, + batch_size: self.max_blocks_in_flight, + }) + }; + + let peer = self.in_flight.as_ref().unwrap(); + + self.message_sender + .send_external_message(peer.peer_id, message) + } + + /// Add a peer to the list of peers. + pub fn add_peer(&mut self, peer: PeerId) { + // new peers should be tried last, which gives them time to sync first. + // peers do not need to be unique. 
+ let new_peer = PeerInfo { + score: self.peers.iter().map(|p| p.score).max().unwrap_or(0), + peer_id: peer, + last_used: Instant::now(), + }; + self.peers.push(new_peer); + } + + /// Remove a peer from the list of peers. + pub fn remove_peer(&mut self, peer: PeerId) { + self.peers.retain(|p| p.peer_id != peer); + } + + pub fn get_next_peer(&mut self, prev_peer: Option) -> Option { + // Push the current peer into the heap, risks spamming the same peer. + // TODO: implement a better strategy for this. + if let Some(peer) = prev_peer { + self.peers.push(peer); + } + + let Some(mut peer) = self.peers.pop() else { + return None; + }; + + // used to determine stale in-flight requests. + peer.last_used = std::time::Instant::now(); + + Some(peer) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct PeerInfo { + score: u32, + peer_id: PeerId, + last_used: Instant, +} + +impl Ord for PeerInfo { + fn cmp(&self, other: &Self) -> Ordering { + self.score + .cmp(&other.score) + .then_with(|| self.last_used.cmp(&other.last_used)) + } +} + +impl PartialOrd for PeerInfo { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index efa18ed8a..879486c3e 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -208,7 +208,7 @@ impl Consensus { )?; } - let blockstore = BlockStore2::new(&config, db.clone(), message_sender.clone())?; + let blockstore = BlockStore2::new(&config, db.clone(), message_sender.clone(), Vec::new())?; // It is important to create the `BlockStore` after the checkpoint has been loaded into the DB. The // `BlockStore` pre-loads and caches information about the currently stored blocks. From 4b7097a8cef2b55a3953f3f4fe296e2dc89853f7 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 16:02:42 +0800 Subject: [PATCH 006/119] feat: added in-flight check. --- zilliqa/src/blockstore.rs | 43 +++++++++++++++++++++++++++++++++------ zilliqa/src/consensus.rs | 3 +++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 7b648d1de..3dabce05a 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -88,13 +88,20 @@ impl BlockStore { Ok(()) } - pub fn process_proposal(&self, block: Block) -> Result<()> { + pub fn process_proposal(&mut self, block: Block) -> Result<()> { // ... // check if block parent exists let parent_block = self.db.get_block_by_hash(&block.parent_hash())?; // no parent block, trigger sync - if parent_block.is_none() {} + let peer = self.in_flight.take(); + self.in_flight = self.get_next_peer(peer); + + if parent_block.is_none() && self.in_flight.is_some() { + self.request_missing_blocks(block)?; + tracing::debug!("Parent block not found, requesting missing blocks",); + return Ok(()); + } Ok(()) } @@ -108,7 +115,28 @@ impl BlockStore { /// If the block gap is large, we request blocks from the last known canonical block forwards. /// If the block gap is small, we request blocks from the latest block backwards. /// - pub fn request_missing_blocks(&mut self, omega_block: Block) -> Result { + pub fn request_missing_blocks(&mut self, omega_block: Block) -> Result<()> { + // Early exit if there's a request in-flight; and if it has not expired. 
+ if let Some(peer) = self.in_flight.as_ref() { + if peer.last_used.elapsed() > self.request_timeout { + tracing::warn!( + "In-flight request {} timed out, requesting from new peer", + peer.peer_id + ); + let mut peer = self.in_flight.take().unwrap(); + peer.score += 1; // TODO: Downgrade score if we keep timing out. + self.in_flight = self.get_next_peer(Some(peer)); + } else { + return Ok(()); + } + } else { + self.in_flight = self.get_next_peer(None); + if self.in_flight.is_none() { + tracing::error!("No peers available to request missing blocks"); + return Ok(()); + } + } + // highest canonical block we have // TODO: Replace this with a single SQL query. let height = self @@ -123,7 +151,7 @@ impl BlockStore { .number .saturating_sub(alpha_block.header.number); - // TODO: Double-check computation + // TODO: Double-check hysteresis logic. let message = if block_gap > self.max_blocks_in_flight as u64 / 2 { // we're far from latest block ExternalMessage::RequestFromHeight(RequestBlock { @@ -142,8 +170,11 @@ impl BlockStore { let peer = self.in_flight.as_ref().unwrap(); + tracing::debug!(?message, "Requesting missing blocks from {}", peer.peer_id); + self.message_sender - .send_external_message(peer.peer_id, message) + .send_external_message(peer.peer_id, message)?; + Ok(()) } /// Add a peer to the list of peers. @@ -163,7 +194,7 @@ impl BlockStore { self.peers.retain(|p| p.peer_id != peer); } - pub fn get_next_peer(&mut self, prev_peer: Option) -> Option { + fn get_next_peer(&mut self, prev_peer: Option) -> Option { // Push the current peer into the heap, risks spamming the same peer. // TODO: implement a better strategy for this. if let Some(peer) = prev_peer { diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 879486c3e..7e9ebb58b 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -650,6 +650,9 @@ impl Consensus { block.hash() ); + // FIXME: Cleanup + self.blockstore.process_proposal(block.clone())?; + if self.block_store.contains_block(&block.hash())? { trace!("ignoring block proposal, block store contains this block already"); return Ok(None); From 58f229008e8e82d6e0e3edd59191f269ed787989 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 16:23:47 +0800 Subject: [PATCH 007/119] feat: added debug/warn/trace messages. --- zilliqa/src/blockstore.rs | 18 +++-- zilliqa/src/node.rs | 140 +++++++++++++++++++++++--------------- 2 files changed, 94 insertions(+), 64 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 3dabce05a..e3e3d5f82 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -13,7 +13,7 @@ use crate::{ cfg::NodeConfig, db::Db, message::{Block, ExternalMessage, Proposal, RequestBlock}, - node::{MessageSender, RequestId}, + node::MessageSender, }; /// Stores and manages the node's list of blocks. Also responsible for making requests for new blocks. 
@@ -94,18 +94,18 @@ impl BlockStore { let parent_block = self.db.get_block_by_hash(&block.parent_hash())?; // no parent block, trigger sync - let peer = self.in_flight.take(); - self.in_flight = self.get_next_peer(peer); - - if parent_block.is_none() && self.in_flight.is_some() { + if parent_block.is_none() { + tracing::warn!( + "blockstore::ProcessProposal : Parent block {} not found, requesting missing blocks", + block.parent_hash() + ); self.request_missing_blocks(block)?; - tracing::debug!("Parent block not found, requesting missing blocks",); return Ok(()); } Ok(()) } - pub fn buffer_proposal(&self, block: Block) { + pub fn buffer_proposal(&self, _block: Block) { // ... } @@ -201,9 +201,7 @@ impl BlockStore { self.peers.push(peer); } - let Some(mut peer) = self.peers.pop() else { - return None; - }; + let mut peer = self.peers.pop()?; // used to determine stale in-flight requests. peer.last_used = std::time::Instant::now(); diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 956346c2f..9cabf3e8a 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -270,31 +270,40 @@ impl Node { .send((response_channel, ExternalMessage::Acknowledgement))?; } ExternalMessage::RequestFromHeight(request) => { + tracing::debug!( + "blockstore::RequestFromHeight : received a block request from {}", + from + ); + if from == self.peer_id { - warn!("block_store::RequestFromHeight : ignoring blocks request to self"); + warn!("blockstore::RequestFromHeight : ignoring blocks from self"); return Ok(()); } // TODO: Check if we should service this request. // Validators shall not respond to this request. - trace!( - "block_store::RequestFromHeight : received a block request - {}", - self.peer_id - ); - - // TODO: Replace this entire block with a single SQL query let Some(alpha) = self.db.get_block_by_hash(&request.from_hash)? else { // We do not have the starting block + tracing::warn!( + "blockstore::RequestFromHeight : missing starting block {}", + request.from_hash + ); self.request_responses.send(( response_channel, ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }), ))?; return Ok(()); }; + + // TODO: Replace this with a single SQL query let mut proposals = Vec::new(); - for num in alpha.number().saturating_add(1) - ..=alpha.number().saturating_add(request.batch_size as u64) + let batch_size = self + .config + .max_blocks_in_flight + .min(request.batch_size as u64); + for num in + alpha.number().saturating_add(1)..=alpha.number().saturating_add(batch_size) { let Some(block) = self.db.get_canonical_block_by_number(num)? else { // that's all we have! 
@@ -303,10 +312,12 @@ impl Node { proposals.push(self.block_to_proposal(block)); } - self.request_responses.send(( - response_channel, - ExternalMessage::ResponseFromHash(ResponseBlock { proposals }), - ))?; + let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals }); + tracing::trace!( + ?message, + "blockstore::RequestFromHeight : responding to block request from height" + ); + self.request_responses.send((response_channel, message))?; } ExternalMessage::ResponseFromHeight(response) => { // Check that we have enough to complete the process, otherwise ignore @@ -331,21 +342,25 @@ impl Node { .send((response_channel, ExternalMessage::Acknowledgement))?; } ExternalMessage::RequestFromHash(request) => { + debug!( + "blockstore::RequestFromHash : received a block request from {}", + from + ); + if from == self.peer_id { - warn!("block_store::RequestFromHash : ignoring blocks request to self"); + warn!("blockstore::RequestFromHash : ignoring request from self"); return Ok(()); } - trace!( - "block_store::RequestFromHash : received a block request - {}", - self.peer_id - ); - // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. let Some(omega_block) = self.db.get_block_by_hash(&request.from_hash)? else { // We do not have the starting block + tracing::warn!( + "blockstore::RequestFromHash : missing starting block {}", + request.from_hash + ); self.request_responses.send(( response_channel, ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }), @@ -356,7 +371,10 @@ impl Node { let mut proposals = Vec::new(); let mut hash = omega_block.parent_hash(); // grab up to batch_size blocks - while proposals.len() < request.batch_size { + let batch_size = request + .batch_size + .min(self.config.max_blocks_in_flight as usize); + while proposals.len() < batch_size { // grab the parent let Some(block) = self.db.get_block_by_hash(&hash)? else { // that's all we have! @@ -366,10 +384,12 @@ impl Node { proposals.push(self.block_to_proposal(block)); } - self.request_responses.send(( - response_channel, - ExternalMessage::ResponseFromHash(ResponseBlock { proposals }), - ))?; + let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals }); + tracing::trace!( + ?message, + "blockstore::RequestFromHash : responding to block request from height" + ); + self.request_responses.send((response_channel, message))?; } ExternalMessage::ResponseFromHash(response) => { // Check that we have enough to complete the process, otherwise ignore @@ -394,43 +414,55 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } - ExternalMessage::BlockRequest(request) => { - if from == self.peer_id { - debug!("block_store::BlockRequest : ignoring blocks request to self"); - return Ok(()); - } - trace!( - "block_store::BlockRequest : received a block request - {}", - self.peer_id - ); - // Note that it is very important that we limit this by number of blocks - // returned, _not_ by max view range returned. If we don't, then any - // view gap larger than block_request_limit will never be filliable - // because no node will ever be prepared to return the block after it. 
- let proposals: Vec = (request.from_view..=request.to_view) - .take(self.config.block_request_limit) - .filter_map(|view| { - self.consensus - .get_block_by_view(view) - .transpose() - .map(|block| Ok(self.block_to_proposal(block?))) - }) - .collect::>()?; - - let availability = self.consensus.block_store.availability()?; - trace!("block_store::BlockRequest - responding to new blocks request {id:?} from {from:?} of {request:?} with props {0:?} availability {availability:?}", - proposals.iter().fold("".to_string(), |state, x| format!("{},{}", state, x.header.view))); - - // Send the response to this block request. + // Respond negatively to old BlockRequests. + ExternalMessage::BlockRequest(request) => { self.request_responses.send(( response_channel, ExternalMessage::BlockResponse(BlockResponse { - proposals, + proposals: vec![], from_view: request.from_view, - availability, + availability: None, }), ))?; + return Ok(()); + + // if from == self.peer_id { + // debug!("block_store::BlockRequest : ignoring blocks request to self"); + // return Ok(()); + // } + + // trace!( + // "block_store::BlockRequest : received a block request - {}", + // self.peer_id + // ); + // // Note that it is very important that we limit this by number of blocks + // // returned, _not_ by max view range returned. If we don't, then any + // // view gap larger than block_request_limit will never be filliable + // // because no node will ever be prepared to return the block after it. + // let proposals: Vec = (request.from_view..=request.to_view) + // .take(self.config.block_request_limit) + // .filter_map(|view| { + // self.consensus + // .get_block_by_view(view) + // .transpose() + // .map(|block| Ok(self.block_to_proposal(block?))) + // }) + // .collect::>()?; + + // let availability = self.consensus.block_store.availability()?; + // trace!("block_store::BlockRequest - responding to new blocks request {id:?} from {from:?} of {request:?} with props {0:?} availability {availability:?}", + // proposals.iter().fold("".to_string(), |state, x| format!("{},{}", state, x.header.view))); + + // // Send the response to this block request. + // self.request_responses.send(( + // response_channel, + // ExternalMessage::BlockResponse(BlockResponse { + // proposals, + // from_view: request.from_view, + // availability, + // }), + // ))?; } // We don't usually expect a [BlockResponse] to be received as a request, however this can occur when our // [BlockStore] has re-sent a previously unusable block because we didn't (yet) have the block's parent. From b6959ad1ebe3eacb8004a140da411bc5663c3a27 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 16:41:27 +0800 Subject: [PATCH 008/119] feat: initial requests firing. 
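Peers are now added to and removed from the sync peer-heap when they subscribe to or unsubscribe from the shard's gossipsub topic (new `AddPeer` / `RemovePeer` messages), and `PeerInfo`'s `Ord` is reversed so that the heap hands out the best peer first.

A minimal, self-contained sketch of that last point (using a simplified stand-in for `PeerInfo`, illustrative only): `std::collections::BinaryHeap` is a max-heap, so comparing `other` against `self` turns it into a min-heap on `score`, and the real impl breaks ties towards the least recently used peer.

    use std::{cmp::Ordering, collections::BinaryHeap};

    #[derive(PartialEq, Eq)]
    struct Peer {
        score: u32,
    }

    impl Ord for Peer {
        // reversed: the lowest score compares as "greatest", so the max-heap pops it first
        fn cmp(&self, other: &Self) -> Ordering {
            other.score.cmp(&self.score)
        }
    }

    impl PartialOrd for Peer {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            Some(self.cmp(other))
        }
    }

    fn main() {
        let mut heap = BinaryHeap::new();
        heap.push(Peer { score: 3 });
        heap.push(Peer { score: 1 });
        heap.push(Peer { score: 2 });
        assert_eq!(heap.pop().unwrap().score, 1); // lowest (best) score is served first
    }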
--- zilliqa/src/blockstore.rs | 9 +++++---- zilliqa/src/consensus.rs | 2 +- zilliqa/src/message.rs | 4 ++++ zilliqa/src/node.rs | 6 ++++++ zilliqa/src/p2p_node.rs | 8 ++++++++ 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index e3e3d5f82..804a752ed 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -96,7 +96,7 @@ impl BlockStore { // no parent block, trigger sync if parent_block.is_none() { tracing::warn!( - "blockstore::ProcessProposal : Parent block {} not found, requesting missing blocks", + "blockstore::ProcessProposal : Parent block {} not found", block.parent_hash() ); self.request_missing_blocks(block)?; @@ -219,9 +219,10 @@ struct PeerInfo { impl Ord for PeerInfo { fn cmp(&self, other: &Self) -> Ordering { - self.score - .cmp(&other.score) - .then_with(|| self.last_used.cmp(&other.last_used)) + other + .score + .cmp(&self.score) + .then_with(|| other.last_used.cmp(&self.last_used)) } } diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 7e9ebb58b..e1f5db700 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -152,7 +152,7 @@ pub struct Consensus { config: NodeConfig, message_sender: MessageSender, reset_timeout: UnboundedSender, - blockstore: BlockStore2, + pub blockstore: BlockStore2, pub block_store: BlockStore, latest_leader_cache: RefCell>, votes: BTreeMap, diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 9a306f3a2..8689c6f80 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -271,6 +271,8 @@ pub enum ExternalMessage { /// An acknowledgement of the receipt of a message. Note this is only used as a response when the caller doesn't /// require any data in the response. Acknowledgement, + AddPeer, + RemovePeer, RequestFromHeight(RequestBlock), RequestFromHash(RequestBlock), ResponseFromHeight(ResponseBlock), @@ -290,6 +292,8 @@ impl ExternalMessage { impl Display for ExternalMessage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { + ExternalMessage::AddPeer => write!(f, "AddPeer"), + ExternalMessage::RemovePeer => write!(f, "RemovePeer"), ExternalMessage::ResponseFromHeight(r) => { write!(f, "ResponseFromHeight({})", r.proposals.len()) } diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 9cabf3e8a..43af7bff3 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -227,6 +227,12 @@ impl Node { )))?; } } + ExternalMessage::AddPeer => { + self.consensus.blockstore.add_peer(from); + } + ExternalMessage::RemovePeer => { + self.consensus.blockstore.remove_peer(from); + } // `Proposals` are re-routed to `handle_request()` _ => { warn!("unexpected message type"); diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index 3e3d5e127..e353d6cf5 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -264,6 +264,14 @@ impl P2pNode { .kademlia .add_address(&peer_id, address.clone()); } + SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Subscribed { peer_id, topic })) => { + let message = ExternalMessage::AddPeer; + self.send_to(&topic, |c| c.broadcasts.send((peer_id, message)))?; + } + SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Unsubscribed { peer_id, topic })) => { + let message = ExternalMessage::RemovePeer; + self.send_to(&topic, |c| c.broadcasts.send((peer_id, message)))?; + } SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Message{ message_id: msg_id, message: gossipsub::Message { From 
8bd7214d8de894028bd6dd4590a2d9775be4efb3 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 16:51:53 +0800 Subject: [PATCH 009/119] feat: convert config value to usize for simplicity. --- zilliqa/src/block_store.rs | 4 ++-- zilliqa/src/blockstore.rs | 6 +++++- zilliqa/src/cfg.rs | 4 ++-- zilliqa/src/node.rs | 21 ++++++++------------- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/zilliqa/src/block_store.rs b/zilliqa/src/block_store.rs index 24756eefe..32bf71a71 100644 --- a/zilliqa/src/block_store.rs +++ b/zilliqa/src/block_store.rs @@ -567,11 +567,11 @@ impl BlockStore { highest_known_view: 0, highest_confirmed_view: 0, peers: BTreeMap::new(), - max_blocks_in_flight: config.max_blocks_in_flight, + max_blocks_in_flight: config.max_blocks_in_flight as u64, failed_request_sleep_duration: config.failed_request_sleep_duration, strategies: vec![BlockStrategy::Latest(constants::RETAINS_LAST_N_BLOCKS)], available_blocks, - buffered: BlockCache::new(config.max_blocks_in_flight), + buffered: BlockCache::new(config.max_blocks_in_flight as u64), unserviceable_requests: None, message_sender, clock: 0, diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 804a752ed..dc3415aab 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -41,6 +41,8 @@ pub struct BlockStore { request_timeout: Duration, // how many blocks to request at once max_blocks_in_flight: usize, + // our peer id + peer_id: PeerId, } impl BlockStore { @@ -58,7 +60,8 @@ impl BlockStore { last_used: Instant::now(), }) .collect(); - + let peer_id = message_sender.our_peer_id; + Ok(Self { db, message_sender, @@ -66,6 +69,7 @@ impl BlockStore { in_flight: None, request_timeout: config.consensus.consensus_timeout, max_blocks_in_flight: config.max_blocks_in_flight.max(31) as usize, // between 30 seconds and 3 days of blocks. + peer_id, }) } diff --git a/zilliqa/src/cfg.rs b/zilliqa/src/cfg.rs index 4e6f9f5d8..c08c0bc88 100644 --- a/zilliqa/src/cfg.rs +++ b/zilliqa/src/cfg.rs @@ -101,7 +101,7 @@ pub struct NodeConfig { pub block_request_limit: usize, /// The maximum number of blocks to have outstanding requests for at a time when syncing. #[serde(default = "max_blocks_in_flight_default")] - pub max_blocks_in_flight: u64, + pub max_blocks_in_flight: usize, /// The maximum number of blocks to request in a single message when syncing. 
#[serde(default = "block_request_batch_size_default")] pub block_request_batch_size: u64, @@ -204,7 +204,7 @@ pub fn block_request_limit_default() -> usize { 100 } -pub fn max_blocks_in_flight_default() -> u64 { +pub fn max_blocks_in_flight_default() -> usize { 1000 } diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 43af7bff3..9b0e481f7 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -297,17 +297,14 @@ impl Node { ); self.request_responses.send(( response_channel, - ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }), + ExternalMessage::ResponseFromHeight(ResponseBlock { proposals: vec![] }), ))?; return Ok(()); }; // TODO: Replace this with a single SQL query let mut proposals = Vec::new(); - let batch_size = self - .config - .max_blocks_in_flight - .min(request.batch_size as u64); + let batch_size = self.config.max_blocks_in_flight.min(request.batch_size) as u64; for num in alpha.number().saturating_add(1)..=alpha.number().saturating_add(batch_size) { @@ -318,7 +315,7 @@ impl Node { proposals.push(self.block_to_proposal(block)); } - let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals }); + let message = ExternalMessage::ResponseFromHeight(ResponseBlock { proposals }); tracing::trace!( ?message, "blockstore::RequestFromHeight : responding to block request from height" @@ -329,16 +326,16 @@ impl Node { // Check that we have enough to complete the process, otherwise ignore if response.proposals.is_empty() { // Empty response, downgrade peer - warn!("block_store::ResponseFromHeight : empty blocks in flight {from}",); + warn!("blockstore::ResponseFromHeight : empty blocks in flight {from}",); } - if response.proposals.len() < self.config.max_blocks_in_flight as usize { + if response.proposals.len() < self.config.max_blocks_in_flight { // Partial response, downgrade peer - warn!("block_store::ResponseFromHeight : insufficient blocks in flight {from}",); + warn!("blockstore::ResponseFromHeight : insufficient blocks in flight {from}",); } // TODO: Inject proposals debug!( - "block_store::ResponseFromHeight : injecting proposals {:?}", + "blockstore::ResponseFromHeight : injecting proposals {:?}", response ); @@ -377,9 +374,7 @@ impl Node { let mut proposals = Vec::new(); let mut hash = omega_block.parent_hash(); // grab up to batch_size blocks - let batch_size = request - .batch_size - .min(self.config.max_blocks_in_flight as usize); + let batch_size = request.batch_size.min(self.config.max_blocks_in_flight); while proposals.len() < batch_size { // grab the parent let Some(block) = self.db.get_block_by_hash(&hash)? else { From 7edca1cdb630f10b9a12dac5016d9618719c9a21 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 17:01:24 +0800 Subject: [PATCH 010/119] feat: added blockstore::handle_response_from_height(). 
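Response handling moves out of node.rs: the `ResponseFromHeight` arm now delegates to `blockstore::handle_response_from_height()`, which warns on empty or partial responses and re-sends each returned proposal to ourselves as an `ExternalMessage::Proposal`, so it still flows through the normal proposal-handling path.

At this point a sync round-trip looks roughly like:

    1. process_proposal() sees a block whose parent we don't have;
    2. request_missing_blocks() picks a peer from the heap and sends RequestFromHeight
       (far behind) or RequestFromHash (close to the tip);
    3. the peer answers with ResponseFromHeight / ResponseFromHash;
    4. handle_response_from_height() re-injects the returned proposals.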
--- zilliqa/src/blockstore.rs | 50 ++++++++++++++++++++++++++------------- zilliqa/src/node.rs | 17 +------------ 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index dc3415aab..7b14b3db7 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -12,7 +12,7 @@ use libp2p::PeerId; use crate::{ cfg::NodeConfig, db::Db, - message::{Block, ExternalMessage, Proposal, RequestBlock}, + message::{Block, ExternalMessage, Proposal, RequestBlock, ResponseBlock}, node::MessageSender, }; @@ -61,7 +61,7 @@ impl BlockStore { }) .collect(); let peer_id = message_sender.our_peer_id; - + Ok(Self { db, message_sender, @@ -73,20 +73,6 @@ impl BlockStore { }) } - /// Route each proposal as if it were received. - pub fn handle_response_from_height(&mut self, proposals: Vec) -> Result<()> { - // Just pump the Proposals back to ourselves, and it will be picked up and processed as if it were received. - // Only issue is the timestamp skew. We should probably fix that. - for p in proposals { - tracing::trace!("Received proposal from height: {:?}", p); - self.message_sender.send_external_message( - self.message_sender.our_peer_id, - ExternalMessage::Proposal(p), - )?; - } - Ok(()) - } - pub fn handle_from_hash(&mut self, _: Vec) -> Result<()> { // ... Ok(()) @@ -113,6 +99,38 @@ impl BlockStore { // ... } + pub fn handle_response_from_height( + &mut self, + from: PeerId, + response: ResponseBlock, + ) -> Result<()> { + // Check that we have enough to complete the process, otherwise ignore + if response.proposals.is_empty() { + // Empty response, downgrade peer + tracing::warn!("blockstore::ResponseFromHeight : empty blocks {from}",); + } + if response.proposals.len() < self.max_blocks_in_flight { + // Partial response, downgrade peer + tracing::warn!("blockstore::ResponseFromHeight : partial blocks {from}",); + } + + // TODO: Inject proposals + tracing::debug!( + "blockstore::ResponseFromHeight : injecting proposals {:?}", + response.proposals + ); + + // Just pump the Proposals back to ourselves, and it will be picked up and processed as if it were received. + // Only issue is the timestamp skew. We should probably fix that. + for p in response.proposals { + tracing::trace!("Received proposal from height: {:?}", p); + self.message_sender + .send_external_message(self.peer_id, ExternalMessage::Proposal(p))?; + } + // ... + Ok(()) + } + /// Request blocks between the current height and the given block. /// /// The approach is to request blocks in batches of `max_blocks_in_flight` blocks. diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 9b0e481f7..305666489 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -323,22 +323,7 @@ impl Node { self.request_responses.send((response_channel, message))?; } ExternalMessage::ResponseFromHeight(response) => { - // Check that we have enough to complete the process, otherwise ignore - if response.proposals.is_empty() { - // Empty response, downgrade peer - warn!("blockstore::ResponseFromHeight : empty blocks in flight {from}",); - } - if response.proposals.len() < self.config.max_blocks_in_flight { - // Partial response, downgrade peer - warn!("blockstore::ResponseFromHeight : insufficient blocks in flight {from}",); - } - - // TODO: Inject proposals - debug!( - "blockstore::ResponseFromHeight : injecting proposals {:?}", - response - ); - + self.consensus.blockstore.handle_response_from_height(from, response)?; // Acknowledge this block response. 
This does nothing because the `BlockResponse` request was sent by // us, but we keep it here for symmetry with the other handlers. self.request_responses From b5b7c35682515efa07c30490a6834a66e8146e5b Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 17:43:44 +0800 Subject: [PATCH 011/119] feat: checkpoint, successful RequestFromHeight-ResponseFromHeight. --- zilliqa/src/blockstore.rs | 83 +++++++++++++++++++++++++++++++++++---- zilliqa/src/message.rs | 3 +- zilliqa/src/node.rs | 60 +++++----------------------- 3 files changed, 85 insertions(+), 61 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 7b14b3db7..abe076842 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -7,6 +7,7 @@ use std::{ use anyhow::Result; +use itertools::Itertools; use libp2p::PeerId; use crate::{ @@ -99,6 +100,61 @@ impl BlockStore { // ... } + fn block_to_proposal(&self, block: Block) -> Proposal { + let txs = block + .transactions + .iter() + .map(|hash| self.db.get_transaction(hash).unwrap().unwrap()) + .map(|tx| tx.verify().unwrap()) + .collect(); + + Proposal::from_parts(block, txs) + } + + pub fn handle_request_from_height( + &mut self, + from: PeerId, + request: RequestBlock, + ) -> Result { + // ... + tracing::debug!( + "blockstore::RequestFromHeight : received a block request from {}", + from + ); + + // TODO: Check if we should service this request. + // Validators shall not respond to this request. + + let Some(alpha) = self.db.get_block_by_hash(&request.from_hash)? else { + // We do not have the starting block + tracing::warn!( + "blockstore::RequestFromHeight : missing starting block {}", + request.from_hash + ); + let message: ExternalMessage = + ExternalMessage::ResponseFromHeight(ResponseBlock { proposals: vec![] }); + return Ok(message); + }; + + // TODO: Replace this with a single SQL query + let mut proposals = Vec::new(); + let batch_size = self.max_blocks_in_flight.min(request.batch_size) as u64; + for num in alpha.number().saturating_add(1)..=alpha.number().saturating_add(batch_size) { + let Some(block) = self.db.get_canonical_block_by_number(num)? else { + // that's all we have! + break; + }; + proposals.push(self.block_to_proposal(block)); + } + + let message = ExternalMessage::ResponseFromHeight(ResponseBlock { proposals }); + tracing::trace!( + ?message, + "blockstore::RequestFromHeight : responding to block request from height" + ); + Ok(message) + } + pub fn handle_response_from_height( &mut self, from: PeerId, @@ -116,17 +172,30 @@ impl BlockStore { // TODO: Inject proposals tracing::debug!( - "blockstore::ResponseFromHeight : injecting proposals {:?}", - response.proposals + "blockstore::ResponseFromHeight : injecting {} proposals", + response.proposals.len() ); + // Sort proposals by number + let proposals = response + .proposals + .into_iter() + .sorted_by_key(|p| p.number()) + .collect_vec(); + // Just pump the Proposals back to ourselves, and it will be picked up and processed as if it were received. // Only issue is the timestamp skew. We should probably fix that. - for p in response.proposals { - tracing::trace!("Received proposal from height: {:?}", p); - self.message_sender - .send_external_message(self.peer_id, ExternalMessage::Proposal(p))?; + for p in proposals { + tracing::trace!( + "Received proposal number: {} hash: {}", + p.number(), + p.hash(), + ); + // replay the proposals } + + // We're done with this peer + self.peers.push(self.in_flight.take().unwrap()); // ... 
Ok(()) } @@ -177,14 +246,12 @@ impl BlockStore { let message = if block_gap > self.max_blocks_in_flight as u64 / 2 { // we're far from latest block ExternalMessage::RequestFromHeight(RequestBlock { - from_number: alpha_block.header.number, from_hash: alpha_block.header.hash, batch_size: self.max_blocks_in_flight, }) } else { // we're close to latest block ExternalMessage::RequestFromHash(RequestBlock { - from_number: omega_block.header.number, from_hash: omega_block.header.hash, batch_size: self.max_blocks_in_flight, }) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 8689c6f80..e51fbdc7b 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -229,7 +229,6 @@ impl fmt::Debug for BlockResponse { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RequestBlock { - pub from_number: u64, pub from_hash: Hash, pub batch_size: usize, } @@ -304,7 +303,7 @@ impl Display for ExternalMessage { write!( f, "RequestFromHeight({}, num={})", - r.from_number, r.batch_size + r.from_hash, r.batch_size ) } ExternalMessage::RequestFromHash(r) => { diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 305666489..48d8fabf3 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -276,59 +276,12 @@ impl Node { .send((response_channel, ExternalMessage::Acknowledgement))?; } ExternalMessage::RequestFromHeight(request) => { - tracing::debug!( - "blockstore::RequestFromHeight : received a block request from {}", - from - ); - - if from == self.peer_id { - warn!("blockstore::RequestFromHeight : ignoring blocks from self"); - return Ok(()); - } - - // TODO: Check if we should service this request. - // Validators shall not respond to this request. - - let Some(alpha) = self.db.get_block_by_hash(&request.from_hash)? else { - // We do not have the starting block - tracing::warn!( - "blockstore::RequestFromHeight : missing starting block {}", - request.from_hash - ); - self.request_responses.send(( - response_channel, - ExternalMessage::ResponseFromHeight(ResponseBlock { proposals: vec![] }), - ))?; - return Ok(()); - }; - - // TODO: Replace this with a single SQL query - let mut proposals = Vec::new(); - let batch_size = self.config.max_blocks_in_flight.min(request.batch_size) as u64; - for num in - alpha.number().saturating_add(1)..=alpha.number().saturating_add(batch_size) - { - let Some(block) = self.db.get_canonical_block_by_number(num)? else { - // that's all we have! - break; - }; - proposals.push(self.block_to_proposal(block)); - } - - let message = ExternalMessage::ResponseFromHeight(ResponseBlock { proposals }); - tracing::trace!( - ?message, - "blockstore::RequestFromHeight : responding to block request from height" - ); + let message = self + .consensus + .blockstore + .handle_request_from_height(from, request)?; self.request_responses.send((response_channel, message))?; } - ExternalMessage::ResponseFromHeight(response) => { - self.consensus.blockstore.handle_response_from_height(from, response)?; - // Acknowledge this block response. This does nothing because the `BlockResponse` request was sent by - // us, but we keep it here for symmetry with the other handlers. 
- self.request_responses - .send((response_channel, ExternalMessage::Acknowledgement))?; - } ExternalMessage::RequestFromHash(request) => { debug!( "blockstore::RequestFromHash : received a block request from {}", @@ -497,6 +450,11 @@ impl Node { pub fn handle_response(&mut self, from: PeerId, message: ExternalMessage) -> Result<()> { debug!(%from, to = %self.peer_id, %message, "handling response"); match message { + ExternalMessage::ResponseFromHeight(response) => { + self.consensus + .blockstore + .handle_response_from_height(from, response)?; + } ExternalMessage::BlockResponse(m) => self.handle_block_response(from, m)?, ExternalMessage::Acknowledgement => {} _ => { From 3fa48405bc90ccf5ab90db782eb6910225b837bd Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 20:17:01 +0800 Subject: [PATCH 012/119] feat: direct insert into DB, without receipts/touched/state. --- zilliqa/src/blockstore.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index abe076842..dd06bf57c 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -13,7 +13,7 @@ use libp2p::PeerId; use crate::{ cfg::NodeConfig, db::Db, - message::{Block, ExternalMessage, Proposal, RequestBlock, ResponseBlock}, + message::{Block, ExternalMessage, InternalMessage, Proposal, RequestBlock, ResponseBlock}, node::MessageSender, }; @@ -187,11 +187,18 @@ impl BlockStore { // Only issue is the timestamp skew. We should probably fix that. for p in proposals { tracing::trace!( - "Received proposal number: {} hash: {}", + "Inserting proposal number: {} hash: {}", p.number(), p.hash(), ); - // replay the proposals + + let (block, transactions) = p.into_parts(); + + // TODO: Bulk SQL insert + for tx in transactions { + self.db.insert_transaction(&tx.calculate_hash(), &tx)?; + } + self.db.insert_block(&block)?; } // We're done with this peer From dcfeed1a84dd57dd1579d80e7cb0740defc1e042 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 27 Dec 2024 23:05:33 +0800 Subject: [PATCH 013/119] feat: successfully injecting blocks/state_trie --- zilliqa/src/blockstore.rs | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index dd06bf57c..e6b6706d5 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -13,7 +13,7 @@ use libp2p::PeerId; use crate::{ cfg::NodeConfig, db::Db, - message::{Block, ExternalMessage, InternalMessage, Proposal, RequestBlock, ResponseBlock}, + message::{Block, ExternalMessage, ProcessProposal, Proposal, RequestBlock, ResponseBlock}, node::MessageSender, }; @@ -170,9 +170,9 @@ impl BlockStore { tracing::warn!("blockstore::ResponseFromHeight : partial blocks {from}",); } - // TODO: Inject proposals - tracing::debug!( - "blockstore::ResponseFromHeight : injecting {} proposals", + // TODO: Any additional checks we should do here? + tracing::info!( + "blockstore::ResponseFromHeight : injecting {} proposals from {from}", response.proposals.len() ); @@ -183,22 +183,21 @@ impl BlockStore { .sorted_by_key(|p| p.number()) .collect_vec(); - // Just pump the Proposals back to ourselves, and it will be picked up and processed as if it were received. - // Only issue is the timestamp skew. We should probably fix that. + // Just pump the Proposals back to ourselves. 
for p in proposals { tracing::trace!( - "Inserting proposal number: {} hash: {}", + "Injecting proposal number: {} hash: {}", p.number(), p.hash(), ); - let (block, transactions) = p.into_parts(); - - // TODO: Bulk SQL insert - for tx in transactions { - self.db.insert_transaction(&tx.calculate_hash(), &tx)?; - } - self.db.insert_block(&block)?; + self.message_sender.send_external_message( + self.peer_id, + ExternalMessage::ProcessProposal(ProcessProposal { + from: self.peer_id.to_bytes(), // FIXME: change this to PeerId instead of Vec + block: p, + }), + )?; } // We're done with this peer @@ -249,7 +248,7 @@ impl BlockStore { .number .saturating_sub(alpha_block.header.number); - // TODO: Double-check hysteresis logic. + // TODO: Double-check hysteresis logic - may not even be necessary to do RequestFromHash let message = if block_gap > self.max_blocks_in_flight as u64 / 2 { // we're far from latest block ExternalMessage::RequestFromHeight(RequestBlock { @@ -266,7 +265,7 @@ impl BlockStore { let peer = self.in_flight.as_ref().unwrap(); - tracing::debug!(?message, "Requesting missing blocks from {}", peer.peer_id); + tracing::info!(?message, "Requesting missing blocks from {}", peer.peer_id); self.message_sender .send_external_message(peer.peer_id, message)?; From b3fb9992145def38aad11a6eca2b664bc3b25ee6 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 28 Dec 2024 10:03:47 +0800 Subject: [PATCH 014/119] feat: small refactor blockstore.rs --- zilliqa/src/blockstore.rs | 124 +++++++++++++++++++++++++++++--------- zilliqa/src/node.rs | 5 ++ 2 files changed, 102 insertions(+), 27 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index e6b6706d5..0845baf6d 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -17,6 +17,13 @@ use crate::{ node::MessageSender, }; +enum DownGrade { + None, + Partial, + Empty, + Timeout, +} + /// Stores and manages the node's list of blocks. Also responsible for making requests for new blocks. /// /// # Syncing Algorithm @@ -155,34 +162,23 @@ impl BlockStore { Ok(message) } - pub fn handle_response_from_height( - &mut self, - from: PeerId, - response: ResponseBlock, - ) -> Result<()> { - // Check that we have enough to complete the process, otherwise ignore - if response.proposals.is_empty() { - // Empty response, downgrade peer - tracing::warn!("blockstore::ResponseFromHeight : empty blocks {from}",); - } - if response.proposals.len() < self.max_blocks_in_flight { - // Partial response, downgrade peer - tracing::warn!("blockstore::ResponseFromHeight : partial blocks {from}",); - } - - // TODO: Any additional checks we should do here? + fn inject_proposals(&mut self, proposals: Vec) -> Result> { tracing::info!( - "blockstore::ResponseFromHeight : injecting {} proposals from {from}", - response.proposals.len() + "blockstore::InjectProposals : injecting {} proposals", + proposals.len() ); + if proposals.is_empty() { + return Ok(None); + } // Sort proposals by number - let proposals = response - .proposals + let proposals = proposals .into_iter() .sorted_by_key(|p| p.number()) .collect_vec(); + let last_proposal = proposals.last().cloned(); + // Just pump the Proposals back to ourselves. for p in proposals { tracing::trace!( @@ -199,9 +195,80 @@ impl BlockStore { }), )?; } + // return last proposal + Ok(last_proposal) + } + + fn done_with_peer(&mut self, downgrade: DownGrade) { + // ... 
+ if let Some(mut peer) = self.in_flight.take() { + peer.score += downgrade as u32; + self.peers.push(peer); + } + } + + pub fn handle_response_from_height( + &mut self, + from: PeerId, + response: ResponseBlock, + ) -> Result<()> { + // Check that we have enough to complete the process, otherwise ignore + if response.proposals.is_empty() { + // Empty response, downgrade peer + tracing::warn!("blockstore::ResponseFromHeight : empty blocks {from}",); + self.done_with_peer(DownGrade::Empty); + return Ok(()); + } else if response.proposals.len() < self.max_blocks_in_flight { + // Partial response, downgrade peer + tracing::warn!("blockstore::ResponseFromHeight : partial blocks {from}",); + self.done_with_peer(DownGrade::Partial); + } else { + self.done_with_peer(DownGrade::None); + } + + tracing::info!( + "blockstore::ResponseFromHeight : received {} blocks from {}", + response.proposals.len(), + from + ); + + // TODO: Any additional checks we should do here? + self.inject_proposals(response.proposals)?; + + // Speculatively request more blocks + Ok(()) + } + + pub fn handle_response_from_hash( + &mut self, + from: PeerId, + response: ResponseBlock, + ) -> Result<()> { + if response.proposals.is_empty() { + // Empty response, downgrade peer + tracing::warn!("blockstore::ResponseFromHash : empty blocks {from}",); + self.done_with_peer(DownGrade::Empty); + return Ok(()); + } else if response.proposals.len() <= self.max_blocks_in_flight / 2 { + // Partial response, downgrade peer + tracing::warn!("blockstore::ResponseFromHash : partial blocks {from}",); + self.done_with_peer(DownGrade::Partial); + return Ok(()); + } else { + // only process full responses + self.done_with_peer(DownGrade::None); + } + + tracing::info!( + "blockstore::ResponseFromHash : received {} blocks from {}", + response.proposals.len(), + from + ); + + // TODO: Any additional checks we should do here? + + self.inject_proposals(response.proposals)?; - // We're done with this peer - self.peers.push(self.in_flight.take().unwrap()); // ... Ok(()) } @@ -220,9 +287,8 @@ impl BlockStore { "In-flight request {} timed out, requesting from new peer", peer.peer_id ); - let mut peer = self.in_flight.take().unwrap(); - peer.score += 1; // TODO: Downgrade score if we keep timing out. - self.in_flight = self.get_next_peer(Some(peer)); + self.done_with_peer(DownGrade::Timeout); + self.in_flight = self.get_next_peer(None); } else { return Ok(()); } @@ -265,7 +331,11 @@ impl BlockStore { let peer = self.in_flight.as_ref().unwrap(); - tracing::info!(?message, "Requesting missing blocks from {}", peer.peer_id); + tracing::info!( + "Requesting {} missing blocks from {}", + self.max_blocks_in_flight, + peer.peer_id, + ); self.message_sender .send_external_message(peer.peer_id, message)?; @@ -277,7 +347,7 @@ impl BlockStore { // new peers should be tried last, which gives them time to sync first. // peers do not need to be unique. 
let new_peer = PeerInfo { - score: self.peers.iter().map(|p| p.score).max().unwrap_or(0), + score: self.peers.iter().map(|p| p.score).max().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), }; diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 48d8fabf3..1086fe460 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -455,6 +455,11 @@ impl Node { .blockstore .handle_response_from_height(from, response)?; } + ExternalMessage::ResponseFromHash(response) => { + self.consensus + .blockstore + .handle_response_from_hash(from, response)?; + } ExternalMessage::BlockResponse(m) => self.handle_block_response(from, m)?, ExternalMessage::Acknowledgement => {} _ => { From ef3a8fbb4b4329645c42076d2fdf578527b6aa55 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 28 Dec 2024 10:49:12 +0800 Subject: [PATCH 015/119] sec: make RequestId random, to mitigate response injections. --- zilliqa/src/node.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 1086fe460..9d125fc15 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -84,9 +84,7 @@ impl MessageSender { } pub fn next_request_id(&mut self) -> RequestId { - let request_id = self.request_id; - self.request_id.0 = self.request_id.0.wrapping_add(1); - request_id + RequestId(rand::random()) // TODO: make this more secure, non-predictable } /// Send a message to a remote node of the same shard. From 36449fa186df54add1c35111ed8866d3be2c4580 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 28 Dec 2024 16:26:10 +0800 Subject: [PATCH 016/119] feat: minor reorg. --- zilliqa/src/blockstore.rs | 149 +++++++++++++++++++++++++++----------- zilliqa/src/node.rs | 69 ++---------------- 2 files changed, 113 insertions(+), 105 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 0845baf6d..bd82a015a 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -20,13 +20,11 @@ use crate::{ enum DownGrade { None, Partial, - Empty, Timeout, + Empty, } -/// Stores and manages the node's list of blocks. Also responsible for making requests for new blocks. -/// -/// # Syncing Algorithm +/// Syncing Algorithm /// /// We rely on [crate::consensus::Consensus] informing us of newly received block proposals via: /// * [BlockStore::process_block] for blocks that can be part of our chain, because we already have their parent. @@ -35,6 +33,9 @@ enum DownGrade { /// Both these code paths also call [BlockStore::request_missing_blocks]. This finds the greatest view of any proposal /// we've seen (whether its part of our chain or not). +// TODO: What if we receive a fork +// TODO: How to start syncing at the start + #[derive(Debug)] pub struct BlockStore { // database @@ -81,11 +82,8 @@ impl BlockStore { }) } - pub fn handle_from_hash(&mut self, _: Vec) -> Result<()> { - // ... - Ok(()) - } - + /// Process a block proposal. + /// Checks if the parent block exists, and if not, triggers a sync. pub fn process_proposal(&mut self, block: Block) -> Result<()> { // ... // check if block parent exists @@ -97,7 +95,7 @@ impl BlockStore { "blockstore::ProcessProposal : Parent block {} not found", block.parent_hash() ); - self.request_missing_blocks(block)?; + self.request_missing_blocks(Some(block))?; return Ok(()); } Ok(()) @@ -107,17 +105,65 @@ impl BlockStore { // ... } + /// Convenience function to convert a block to a proposal (add full txs) + /// NOTE: Includes intershard transactions. 
Should only be used for syncing history, + /// not for consensus messages regarding new blocks. fn block_to_proposal(&self, block: Block) -> Proposal { + // since block must be valid, unwrap(s) are safe let txs = block .transactions .iter() .map(|hash| self.db.get_transaction(hash).unwrap().unwrap()) .map(|tx| tx.verify().unwrap()) - .collect(); + .collect_vec(); Proposal::from_parts(block, txs) } + pub fn handle_request_from_hash( + &mut self, + from: PeerId, + request: RequestBlock, + ) -> Result { + tracing::debug!( + "blockstore::RequestFromHash : received a block request from {}", + from + ); + + // TODO: Check if we should service this request + // Validators could respond to this request if there is nothing else to do. + + let Some(omega_block) = self.db.get_block_by_hash(&request.from_hash)? else { + // We do not have the starting block + tracing::warn!( + "blockstore::RequestFromHash : missing starting block {}", + request.from_hash + ); + let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }); + return Ok(message); + }; + + let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send + let mut proposals: Vec = Vec::new(); + let mut hash = omega_block.parent_hash(); + while proposals.len() < batch_size { + // grab the parent + let Some(block) = self.db.get_block_by_hash(&hash)? else { + // that's all we have! + break; + }; + hash = block.parent_hash(); + proposals.push(self.block_to_proposal(block)); + } + + let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals }); + tracing::trace!( + ?message, + "blockstore::RequestFromHash : responding to block request from height" + ); + Ok(message) + } + pub fn handle_request_from_height( &mut self, from: PeerId, @@ -144,8 +190,8 @@ impl BlockStore { }; // TODO: Replace this with a single SQL query + let batch_size = self.max_blocks_in_flight.min(request.batch_size) as u64; // mitigate DOS attacks by limiting the number of blocks we send let mut proposals = Vec::new(); - let batch_size = self.max_blocks_in_flight.min(request.batch_size) as u64; for num in alpha.number().saturating_add(1)..=alpha.number().saturating_add(batch_size) { let Some(block) = self.db.get_canonical_block_by_number(num)? else { // that's all we have! @@ -162,14 +208,15 @@ impl BlockStore { Ok(message) } - fn inject_proposals(&mut self, proposals: Vec) -> Result> { + /// Pump the proposals into the chain. + fn inject_proposals(&mut self, proposals: Vec) -> Result<()> { tracing::info!( "blockstore::InjectProposals : injecting {} proposals", proposals.len() ); if proposals.is_empty() { - return Ok(None); + return Ok(()); } // Sort proposals by number let proposals = proposals @@ -177,8 +224,6 @@ impl BlockStore { .sorted_by_key(|p| p.number()) .collect_vec(); - let last_proposal = proposals.last().cloned(); - // Just pump the Proposals back to ourselves. for p in proposals { tracing::trace!( @@ -196,9 +241,10 @@ impl BlockStore { )?; } // return last proposal - Ok(last_proposal) + Ok(()) } + /// Downgrade a peer based on the response received. fn done_with_peer(&mut self, downgrade: DownGrade) { // ... if let Some(mut peer) = self.in_flight.take() { @@ -212,7 +258,7 @@ impl BlockStore { from: PeerId, response: ResponseBlock, ) -> Result<()> { - // Check that we have enough to complete the process, otherwise ignore + // Process whatever we have received. 
if response.proposals.is_empty() { // Empty response, downgrade peer tracing::warn!("blockstore::ResponseFromHeight : empty blocks {from}",); @@ -233,9 +279,29 @@ impl BlockStore { ); // TODO: Any additional checks we should do here? + + // Inject received proposals + let next_hash = response.proposals.last().unwrap().hash(); self.inject_proposals(response.proposals)?; - // Speculatively request more blocks + // Speculatively request more blocks, as there might be more + self.in_flight = self.get_next_peer(); + if let Some(peer) = self.in_flight.as_ref() { + let message = ExternalMessage::RequestFromHeight(RequestBlock { + batch_size: self.max_blocks_in_flight, + from_hash: next_hash, + }); + + tracing::info!( + "Requesting {} missing blocks from {}", + self.max_blocks_in_flight, + peer.peer_id, + ); + + self.message_sender + .send_external_message(peer.peer_id, message)?; + } + Ok(()) } @@ -244,13 +310,15 @@ impl BlockStore { from: PeerId, response: ResponseBlock, ) -> Result<()> { + // Check that we have enough to complete the process, otherwise ignore if response.proposals.is_empty() { - // Empty response, downgrade peer + // Empty response, downgrade peer, skip tracing::warn!("blockstore::ResponseFromHash : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); } else if response.proposals.len() <= self.max_blocks_in_flight / 2 { // Partial response, downgrade peer + // Skip processing because we want to ensure that we have ALL the needed blocks to sync up. tracing::warn!("blockstore::ResponseFromHash : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); return Ok(()); @@ -267,19 +335,18 @@ impl BlockStore { // TODO: Any additional checks we should do here? + // Inject the proposals self.inject_proposals(response.proposals)?; - - // ... Ok(()) } /// Request blocks between the current height and the given block. /// /// The approach is to request blocks in batches of `max_blocks_in_flight` blocks. + /// If None block is provided, we request blocks from the last known canonical block forwards. /// If the block gap is large, we request blocks from the last known canonical block forwards. /// If the block gap is small, we request blocks from the latest block backwards. - /// - pub fn request_missing_blocks(&mut self, omega_block: Block) -> Result<()> { + pub fn request_missing_blocks(&mut self, omega_block: Option) -> Result<()> { // Early exit if there's a request in-flight; and if it has not expired. if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { @@ -288,14 +355,14 @@ impl BlockStore { peer.peer_id ); self.done_with_peer(DownGrade::Timeout); - self.in_flight = self.get_next_peer(None); + self.in_flight = self.get_next_peer(); } else { return Ok(()); } } else { - self.in_flight = self.get_next_peer(None); + self.in_flight = self.get_next_peer(); if self.in_flight.is_none() { - tracing::error!("No peers available to request missing blocks"); + tracing::warn!("Insufficient peers available to request missing blocks"); return Ok(()); } } @@ -309,10 +376,15 @@ impl BlockStore { let alpha_block = self.db.get_canonical_block_by_number(height)?.unwrap(); // Compute the block gap. 
- let block_gap = omega_block - .header - .number - .saturating_sub(alpha_block.header.number); + let block_gap = if let Some(omega_block) = omega_block.as_ref() { + omega_block + .header + .number + .saturating_sub(alpha_block.header.number) + } else { + // Trigger a RequestFromHeight if the source block is None + self.max_blocks_in_flight as u64 + }; // TODO: Double-check hysteresis logic - may not even be necessary to do RequestFromHash let message = if block_gap > self.max_blocks_in_flight as u64 / 2 { @@ -324,7 +396,7 @@ impl BlockStore { } else { // we're close to latest block ExternalMessage::RequestFromHash(RequestBlock { - from_hash: omega_block.header.hash, + from_hash: omega_block.unwrap().header.hash, batch_size: self.max_blocks_in_flight, }) }; @@ -345,7 +417,6 @@ impl BlockStore { /// Add a peer to the list of peers. pub fn add_peer(&mut self, peer: PeerId) { // new peers should be tried last, which gives them time to sync first. - // peers do not need to be unique. let new_peer = PeerInfo { score: self.peers.iter().map(|p| p.score).max().unwrap_or_default(), peer_id: peer, @@ -359,18 +430,14 @@ impl BlockStore { self.peers.retain(|p| p.peer_id != peer); } - fn get_next_peer(&mut self, prev_peer: Option) -> Option { - // Push the current peer into the heap, risks spamming the same peer. + fn get_next_peer(&mut self) -> Option { // TODO: implement a better strategy for this. - if let Some(peer) = prev_peer { - self.peers.push(peer); + if self.peers.len() < 2 { + return None; } let mut peer = self.peers.pop()?; - - // used to determine stale in-flight requests. - peer.last_used = std::time::Instant::now(); - + peer.last_used = std::time::Instant::now(); // used to determine stale in-flight requests. Some(peer) } } diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 9d125fc15..2c0c730fa 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -34,7 +34,7 @@ use crate::{ inspector::{self, ScillaInspector}, message::{ Block, BlockHeader, BlockResponse, ExternalMessage, InternalMessage, IntershardCall, - ProcessProposal, Proposal, ResponseBlock, + ProcessProposal, Proposal, }, node_launcher::ResponseChannel, p2p_node::{LocalMessageTuple, OutboundMessageTuple}, @@ -281,51 +281,10 @@ impl Node { self.request_responses.send((response_channel, message))?; } ExternalMessage::RequestFromHash(request) => { - debug!( - "blockstore::RequestFromHash : received a block request from {}", - from - ); - - if from == self.peer_id { - warn!("blockstore::RequestFromHash : ignoring request from self"); - return Ok(()); - } - - // TODO: Check if we should service this request - // Validators could respond to this request if there is nothing else to do. - - let Some(omega_block) = self.db.get_block_by_hash(&request.from_hash)? else { - // We do not have the starting block - tracing::warn!( - "blockstore::RequestFromHash : missing starting block {}", - request.from_hash - ); - self.request_responses.send(( - response_channel, - ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }), - ))?; - return Ok(()); - }; - - let mut proposals = Vec::new(); - let mut hash = omega_block.parent_hash(); - // grab up to batch_size blocks - let batch_size = request.batch_size.min(self.config.max_blocks_in_flight); - while proposals.len() < batch_size { - // grab the parent - let Some(block) = self.db.get_block_by_hash(&hash)? else { - // that's all we have! 
- break; - }; - hash = block.parent_hash(); - proposals.push(self.block_to_proposal(block)); - } - - let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals }); - tracing::trace!( - ?message, - "blockstore::RequestFromHash : responding to block request from height" - ); + let message = self + .consensus + .blockstore + .handle_request_from_hash(from, request)?; self.request_responses.send((response_channel, message))?; } ExternalMessage::ResponseFromHash(response) => { @@ -1008,24 +967,6 @@ impl Node { self.peer_num.load(std::sync::atomic::Ordering::Relaxed) } - /// Convenience function to convert a block to a proposal (add full txs) - /// NOTE: Includes intershard transactions. Should only be used for syncing history, - /// not for consensus messages regarding new blocks. - fn block_to_proposal(&self, block: Block) -> Proposal { - let txs: Vec<_> = block - .transactions - .iter() - .map(|tx_hash| { - self.consensus - .get_transaction_by_hash(*tx_hash) - .unwrap() - .unwrap() - }) - .collect(); - - Proposal::from_parts(block, txs) - } - fn handle_proposal(&mut self, from: PeerId, proposal: Proposal) -> Result<()> { if let Some((to, message)) = self.consensus.proposal(from, proposal, false)? { self.reset_timeout From a8abeac1da63b98c365f79edd86e19e08b355a5b Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 28 Dec 2024 17:09:15 +0800 Subject: [PATCH 017/119] feat: disable speculative requests for now, until we have a better way to limit it. --- zilliqa/src/blockstore.rs | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index bd82a015a..a1fe815ab 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -35,6 +35,7 @@ enum DownGrade { // TODO: What if we receive a fork // TODO: How to start syncing at the start +// TODO: Do speculative fetches #[derive(Debug)] pub struct BlockStore { @@ -281,26 +282,26 @@ impl BlockStore { // TODO: Any additional checks we should do here? // Inject received proposals - let next_hash = response.proposals.last().unwrap().hash(); + // let next_hash = response.proposals.last().unwrap().hash(); self.inject_proposals(response.proposals)?; // Speculatively request more blocks, as there might be more - self.in_flight = self.get_next_peer(); - if let Some(peer) = self.in_flight.as_ref() { - let message = ExternalMessage::RequestFromHeight(RequestBlock { - batch_size: self.max_blocks_in_flight, - from_hash: next_hash, - }); - - tracing::info!( - "Requesting {} missing blocks from {}", - self.max_blocks_in_flight, - peer.peer_id, - ); - - self.message_sender - .send_external_message(peer.peer_id, message)?; - } + // self.in_flight = self.get_next_peer(); + // if let Some(peer) = self.in_flight.as_ref() { + // let message = ExternalMessage::RequestFromHeight(RequestBlock { + // batch_size: self.max_blocks_in_flight, + // from_hash: next_hash, + // }); + + // tracing::info!( + // "Requesting {} missing blocks from {}", + // self.max_blocks_in_flight, + // peer.peer_id, + // ); + + // self.message_sender + // .send_external_message(peer.peer_id, message)?; + // } Ok(()) } From 1f65f75c5963066dae06debc2b594a7422b192fe Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 28 Dec 2024 21:25:59 +0800 Subject: [PATCH 018/119] feat: re-enabled speculative fetch. 
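
Speculative fetching keeps requesting the next forward batch while earlier
responses are still being injected, bounded by `max_blocks_injected`. A rough,
standalone sketch of the bookkeeping this assumes (plain counters stand in for
the real proposal queue; the constants are illustrative, not the shipped
defaults):

    // Runnable sketch of the speculative-fetch gate.
    fn main() {
        let max_blocks_injected = 310usize; // assumed cap on queued proposals
        let batch_size = 31usize;
        let mut injected = 0usize;

        // A ResponseFromHeight arrives and its proposals are looped back to us.
        injected += batch_size;

        // Only fire the next speculative RequestFromHeight while the queue is small.
        if injected < max_blocks_injected {
            println!("request next batch of {batch_size}");
        }

        // Each proposal handled off the queue is expected to drain the counter
        // again (the handler itself is still a stub at this point in the series).
        injected = injected.saturating_sub(1);
        println!("injected = {injected}");
    }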
--- zilliqa/src/blockstore.rs | 71 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index a1fe815ab..3238bde93 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -34,8 +34,8 @@ enum DownGrade { /// we've seen (whether its part of our chain or not). // TODO: What if we receive a fork -// TODO: How to start syncing at the start -// TODO: Do speculative fetches +// TODO: How to start syncing validators +// TODO: How to handle restarting new blocks, while injected blocks are still in-queue. #[derive(Debug)] pub struct BlockStore { @@ -51,8 +51,12 @@ pub struct BlockStore { request_timeout: Duration, // how many blocks to request at once max_blocks_in_flight: usize, + // how many blocks to inject into the queue + max_blocks_injected: usize, // our peer id peer_id: PeerId, + // how many injected proposals + injected: usize, } impl BlockStore { @@ -71,18 +75,27 @@ impl BlockStore { }) .collect(); let peer_id = message_sender.our_peer_id; + let max_blocks = config.max_blocks_in_flight.max(31) as usize; // between 30 seconds and 3 days of blocks. Ok(Self { db, message_sender, peers, - in_flight: None, - request_timeout: config.consensus.consensus_timeout, - max_blocks_in_flight: config.max_blocks_in_flight.max(31) as usize, // between 30 seconds and 3 days of blocks. peer_id, + request_timeout: config.consensus.consensus_timeout, + max_blocks_in_flight: max_blocks, + max_blocks_injected: max_blocks * 10, // fire 10 speculative requests + in_flight: None, + injected: 0, }) } + /// Handle an injected proposal + /// + pub fn handle_injected_proposal(&mut self, proposal: Proposal) -> Result<()> { + Ok(()) + } + /// Process a block proposal. /// Checks if the parent block exists, and if not, triggers a sync. pub fn process_proposal(&mut self, block: Block) -> Result<()> { @@ -102,10 +115,6 @@ impl BlockStore { Ok(()) } - pub fn buffer_proposal(&self, _block: Block) { - // ... - } - /// Convenience function to convert a block to a proposal (add full txs) /// NOTE: Includes intershard transactions. Should only be used for syncing history, /// not for consensus messages regarding new blocks. @@ -121,6 +130,9 @@ impl BlockStore { Proposal::from_parts(block, txs) } + /// Request blocks from a hash, backwards. + /// + /// It will collect N blocks by following the block.parent_hash() of the requested block. pub fn handle_request_from_hash( &mut self, from: PeerId, @@ -165,6 +177,7 @@ impl BlockStore { Ok(message) } + /// Request for blocks from a height, forwards. pub fn handle_request_from_height( &mut self, from: PeerId, @@ -225,6 +238,9 @@ impl BlockStore { .sorted_by_key(|p| p.number()) .collect_vec(); + // Increment propoals injected + self.injected += proposals.len(); + // Just pump the Proposals back to ourselves. for p in proposals { tracing::trace!( @@ -282,27 +298,28 @@ impl BlockStore { // TODO: Any additional checks we should do here? 
// Inject received proposals - // let next_hash = response.proposals.last().unwrap().hash(); + let next_hash = response.proposals.last().unwrap().hash(); self.inject_proposals(response.proposals)?; // Speculatively request more blocks, as there might be more - // self.in_flight = self.get_next_peer(); - // if let Some(peer) = self.in_flight.as_ref() { - // let message = ExternalMessage::RequestFromHeight(RequestBlock { - // batch_size: self.max_blocks_in_flight, - // from_hash: next_hash, - // }); - - // tracing::info!( - // "Requesting {} missing blocks from {}", - // self.max_blocks_in_flight, - // peer.peer_id, - // ); - - // self.message_sender - // .send_external_message(peer.peer_id, message)?; - // } + if self.injected < self.max_blocks_injected { + self.in_flight = self.get_next_peer(); + if let Some(peer) = self.in_flight.as_ref() { + let message = ExternalMessage::RequestFromHeight(RequestBlock { + batch_size: self.max_blocks_in_flight, + from_hash: next_hash, + }); + + tracing::info!( + "Requesting {} future blocks from {}", + self.max_blocks_in_flight, + peer.peer_id, + ); + self.message_sender + .send_external_message(peer.peer_id, message)?; + } + } Ok(()) } @@ -432,7 +449,7 @@ impl BlockStore { } fn get_next_peer(&mut self) -> Option { - // TODO: implement a better strategy for this. + // Minimum of 2 peers to avoid single source of truth. if self.peers.len() < 2 { return None; } From 093054036a6289ca4c6324fafdd061a50686ba47 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 10:12:12 +0800 Subject: [PATCH 019/119] feat: use InjectedProposals instead of ProcessProposals. --- zilliqa/src/blockstore.rs | 33 +++++++++++++++++++-------------- zilliqa/src/cfg.rs | 4 ++-- zilliqa/src/message.rs | 12 ++++++++++++ zilliqa/src/node.rs | 13 +++++++------ 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 3238bde93..6000a8d34 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -13,7 +13,7 @@ use libp2p::PeerId; use crate::{ cfg::NodeConfig, db::Db, - message::{Block, ExternalMessage, ProcessProposal, Proposal, RequestBlock, ResponseBlock}, + message::{Block, ExternalMessage, InjectedProposal, Proposal, RequestBlock, ResponseBlock}, node::MessageSender, }; @@ -34,8 +34,6 @@ enum DownGrade { /// we've seen (whether its part of our chain or not). // TODO: What if we receive a fork -// TODO: How to start syncing validators -// TODO: How to handle restarting new blocks, while injected blocks are still in-queue. #[derive(Debug)] pub struct BlockStore { @@ -75,7 +73,6 @@ impl BlockStore { }) .collect(); let peer_id = message_sender.our_peer_id; - let max_blocks = config.max_blocks_in_flight.max(31) as usize; // between 30 seconds and 3 days of blocks. Ok(Self { db, @@ -83,16 +80,19 @@ impl BlockStore { peers, peer_id, request_timeout: config.consensus.consensus_timeout, - max_blocks_in_flight: max_blocks, - max_blocks_injected: max_blocks * 10, // fire 10 speculative requests + max_blocks_in_flight: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. 
+ max_blocks_injected: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, injected: 0, }) } - /// Handle an injected proposal - /// - pub fn handle_injected_proposal(&mut self, proposal: Proposal) -> Result<()> { + /// Match a received proposal + pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { + if prop.from != self.peer_id { + tracing::warn!("Received a foreign InjectedProposal from {}", prop.from); + } + self.injected = self.injected.saturating_sub(1); Ok(()) } @@ -238,8 +238,8 @@ impl BlockStore { .sorted_by_key(|p| p.number()) .collect_vec(); - // Increment propoals injected - self.injected += proposals.len(); + // Increment proposals injected + self.injected = self.injected.saturating_add(proposals.len()); // Just pump the Proposals back to ourselves. for p in proposals { @@ -251,8 +251,8 @@ impl BlockStore { self.message_sender.send_external_message( self.peer_id, - ExternalMessage::ProcessProposal(ProcessProposal { - from: self.peer_id.to_bytes(), // FIXME: change this to PeerId instead of Vec + ExternalMessage::InjectedProposal(InjectedProposal { + from: self.peer_id, block: p, }), )?; @@ -297,8 +297,10 @@ impl BlockStore { // TODO: Any additional checks we should do here? - // Inject received proposals + // Last known proposal let next_hash = response.proposals.last().unwrap().hash(); + + // Inject received proposals self.inject_proposals(response.proposals)?; // Speculatively request more blocks, as there might be more @@ -378,6 +380,9 @@ impl BlockStore { return Ok(()); } } else { + if self.injected > 0 { + return Ok(()); + } self.in_flight = self.get_next_peer(); if self.in_flight.is_none() { tracing::warn!("Insufficient peers available to request missing blocks"); diff --git a/zilliqa/src/cfg.rs b/zilliqa/src/cfg.rs index c08c0bc88..cf137e4a4 100644 --- a/zilliqa/src/cfg.rs +++ b/zilliqa/src/cfg.rs @@ -104,7 +104,7 @@ pub struct NodeConfig { pub max_blocks_in_flight: usize, /// The maximum number of blocks to request in a single message when syncing. #[serde(default = "block_request_batch_size_default")] - pub block_request_batch_size: u64, + pub block_request_batch_size: usize, /// The maximum number of key value pairs allowed to be returned withing the response of the `GetSmartContractState` RPC. Defaults to no limit. #[serde(default = "state_rpc_limit_default")] pub state_rpc_limit: usize, @@ -208,7 +208,7 @@ pub fn max_blocks_in_flight_default() -> usize { 1000 } -pub fn block_request_batch_size_default() -> u64 { +pub fn block_request_batch_size_default() -> usize { 100 } diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index e51fbdc7b..c2724b3cc 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -238,6 +238,14 @@ pub struct ResponseBlock { pub proposals: Vec, } +/// Used to convey proposal processing internally, to avoid blocking threads for too long. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InjectedProposal { + // An encoded PeerId + pub from: PeerId, + pub block: Proposal, +} + /// Used to convey proposal processing internally, to avoid blocking threads for too long. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProcessProposal { @@ -276,6 +284,7 @@ pub enum ExternalMessage { RequestFromHash(RequestBlock), ResponseFromHeight(ResponseBlock), ResponseFromHash(ResponseBlock), + InjectedProposal(InjectedProposal), } impl ExternalMessage { @@ -291,6 +300,9 @@ impl ExternalMessage { impl Display for ExternalMessage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { + ExternalMessage::InjectedProposal(p) => { + write!(f, "InjectedProposal {}", p.block.number()) + } ExternalMessage::AddPeer => write!(f, "AddPeer"), ExternalMessage::RemovePeer => write!(f, "RemovePeer"), ExternalMessage::ResponseFromHeight(r) => { diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 2c0c730fa..dfe830354 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -33,8 +33,8 @@ use crate::{ exec::{PendingState, TransactionApplyResult}, inspector::{self, ScillaInspector}, message::{ - Block, BlockHeader, BlockResponse, ExternalMessage, InternalMessage, IntershardCall, - ProcessProposal, Proposal, + Block, BlockHeader, BlockResponse, ExternalMessage, InjectedProposal, InternalMessage, + IntershardCall, Proposal, }, node_launcher::ResponseChannel, p2p_node::{LocalMessageTuple, OutboundMessageTuple}, @@ -373,8 +373,8 @@ impl Node { } // This just breaks down group block messages into individual messages to stop them blocking threads // for long periods. - ExternalMessage::ProcessProposal(m) => { - self.handle_process_proposal(from, m)?; + ExternalMessage::InjectedProposal(p) => { + self.handle_injected_proposal(from, p)?; } // Handle requests which contain a block proposal. Initially sent as a broadcast, it is re-routed into // a Request by the underlying layer, with a faux request-id. This is to mitigate issues when there are @@ -1001,12 +1001,13 @@ impl Node { Ok(()) } - fn handle_process_proposal(&mut self, from: PeerId, req: ProcessProposal) -> Result<()> { + fn handle_injected_proposal(&mut self, from: PeerId, req: InjectedProposal) -> Result<()> { if from != self.consensus.peer_id() { - warn!("Someone ({from}) sent me a ProcessProposal; illegal- ignoring"); + warn!("Someone ({from}) sent me a InjectedProposal; illegal- ignoring"); return Ok(()); } trace!("Handling proposal for view {0}", req.block.header.view); + self.consensus.blockstore.mark_received_proposal(&req)?; let proposal = self.consensus.receive_block(from, req.block)?; if let Some(proposal) = proposal { trace!( From 45c202e9fcb66378f9a0a74467045d3ad949e7d1 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 10:35:38 +0800 Subject: [PATCH 020/119] chore: minor cleanups. 
--- zilliqa/src/blockstore.rs | 1 - zilliqa/src/lib.rs | 2 +- zilliqa/src/node.rs | 24 ------------------------ zilliqa/src/p2p_node.rs | 2 +- 4 files changed, 2 insertions(+), 27 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 6000a8d34..d555651ea 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -6,7 +6,6 @@ use std::{ }; use anyhow::Result; - use itertools::Itertools; use libp2p::PeerId; diff --git a/zilliqa/src/lib.rs b/zilliqa/src/lib.rs index 642e82df2..28445f822 100644 --- a/zilliqa/src/lib.rs +++ b/zilliqa/src/lib.rs @@ -1,6 +1,7 @@ pub mod api; pub mod block_store; mod blockhooks; +pub mod blockstore; pub mod cfg; pub mod consensus; pub mod constants; @@ -27,4 +28,3 @@ pub mod test_util; pub mod time; pub mod transaction; pub mod zq1_proto; -pub mod blockstore; diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index dfe830354..ce792ca5a 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -287,30 +287,6 @@ impl Node { .handle_request_from_hash(from, request)?; self.request_responses.send((response_channel, message))?; } - ExternalMessage::ResponseFromHash(response) => { - // Check that we have enough to complete the process, otherwise ignore - if response.proposals.is_empty() { - // Empty response, downgrade peer - warn!("block_store::ResponseFromHeight : empty blocks in flight {from}",); - } - // Check that we have enough to complete the process, otherwise ignore - if response.proposals.len() * 2 < self.config.max_blocks_in_flight as usize { - warn!("block_store::ResponseFromHash : insufficient blocks in flight {from}",); - return Ok(()); - } - - // TODO: Inject proposals - debug!( - "block_store::ResponseFromHash : injecting proposals {:?}", - response - ); - - // Acknowledge this block response. This does nothing because the `BlockResponse` request was sent by - // us, but we keep it here for symmetry with the other handlers. - self.request_responses - .send((response_channel, ExternalMessage::Acknowledgement))?; - } - // Respond negatively to old BlockRequests. ExternalMessage::BlockRequest(request) => { self.request_responses.send(( diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index e353d6cf5..a2632ad34 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -271,7 +271,7 @@ impl P2pNode { SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Unsubscribed { peer_id, topic })) => { let message = ExternalMessage::RemovePeer; self.send_to(&topic, |c| c.broadcasts.send((peer_id, message)))?; - } + } SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Message{ message_id: msg_id, message: gossipsub::Message { From 3f379b77439dc55c5f63ce270f479aafb8341f67 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 11:12:43 +0800 Subject: [PATCH 021/119] feat: avoid single source of truth. --- zilliqa/src/blockstore.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index d555651ea..8140b1618 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -262,9 +262,11 @@ impl BlockStore { /// Downgrade a peer based on the response received. fn done_with_peer(&mut self, downgrade: DownGrade) { - // ... if let Some(mut peer) = self.in_flight.take() { - peer.score += downgrade as u32; + // Downgrade peer, if necessary + peer.score = peer.score.saturating_add(downgrade as u32); + // Ensure that the next peer is equal or better, to avoid a single source of truth. 
+ peer.score = peer.score.max(self.peers.peek().unwrap().score); self.peers.push(peer); } } From 7a522d96d56226b1b41a884499baaf9defe4c3fa Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 14:15:14 +0800 Subject: [PATCH 022/119] fix: insufficient peers in GCP. --- zilliqa/src/p2p_node.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index a2632ad34..6fd941b23 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -133,7 +133,8 @@ impl P2pNode { // So, the nodes are unable to see each other directly and remain isolated, defeating kademlia and autonat. identify: identify::Behaviour::new( identify::Config::new("zilliqa/1.0.0".into(), key_pair.public()) - .with_hide_listen_addrs(!cfg!(debug_assertions)), + .with_hide_listen_addrs(false) + .with_push_listen_addr_updates(true), ), }) })? From 85a797532ad5e2229858f8a6dffa71cd063acd78 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 15:46:34 +0800 Subject: [PATCH 023/119] feat: only inject blocks sourced from two peers - impossible to sync, atm. --- zilliqa/src/blockstore.rs | 168 +++++++++++++++++++++++--------------- zilliqa/src/message.rs | 13 +-- zilliqa/src/node.rs | 8 +- 3 files changed, 114 insertions(+), 75 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 8140b1618..466d362c7 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -1,6 +1,6 @@ use std::{ cmp::Ordering, - collections::BinaryHeap, + collections::{BinaryHeap, HashMap}, sync::Arc, time::{Duration, Instant}, }; @@ -25,14 +25,17 @@ enum DownGrade { /// Syncing Algorithm /// -/// We rely on [crate::consensus::Consensus] informing us of newly received block proposals via: -/// * [BlockStore::process_block] for blocks that can be part of our chain, because we already have their parent. -/// * [BlockStore::buffer_proposal] for blocks that can't (yet) be part of our chain. +/// When a Proposal is received by Consensus, we check if the parent exists in our DB. +/// If not, then it triggers a syncing algorithm. +/// +/// 1. We check if the gap between our last canonical block and the latest Proposal. +/// a. If it is a small gap, we request for history, going backwards from Proposal. +/// b. If it is a big gap, we request for history, going forwards from Canonical. +/// 2. When we receive a response, we inject the Proposals into our processing pipeline. /// -/// Both these code paths also call [BlockStore::request_missing_blocks]. This finds the greatest view of any proposal -/// we've seen (whether its part of our chain or not). 
// TODO: What if we receive a fork +// TODO: How to handle adverserial history #[derive(Debug)] pub struct BlockStore { @@ -54,6 +57,9 @@ pub struct BlockStore { peer_id: PeerId, // how many injected proposals injected: usize, + // cache + cache: HashMap, + latest_block: Option, } impl BlockStore { @@ -83,13 +89,24 @@ impl BlockStore { max_blocks_injected: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, injected: 0, + cache: HashMap::new(), + latest_block: None, }) } /// Match a received proposal pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { if prop.from != self.peer_id { - tracing::warn!("Received a foreign InjectedProposal from {}", prop.from); + tracing::error!( + "blockstore::MarkReceivedProposal : foreign InjectedProposal from {}", + prop.from + ); + } + if let Some((_, p)) = self.cache.remove(&prop.block.number()) { + tracing::warn!( + "blockstore::MarkReceivedProposal : removing stale cache proposal {}", + p.number() + ); } self.injected = self.injected.saturating_sub(1); Ok(()) @@ -177,28 +194,28 @@ impl BlockStore { } /// Request for blocks from a height, forwards. - pub fn handle_request_from_height( + pub fn handle_request_from_number( &mut self, from: PeerId, request: RequestBlock, ) -> Result { // ... tracing::debug!( - "blockstore::RequestFromHeight : received a block request from {}", + "blockstore::RequestFromNumber : received a block request from {}", from ); // TODO: Check if we should service this request. // Validators shall not respond to this request. - let Some(alpha) = self.db.get_block_by_hash(&request.from_hash)? else { + let Some(alpha) = self.db.get_canonical_block_by_number(request.from_number)? else { // We do not have the starting block tracing::warn!( - "blockstore::RequestFromHeight : missing starting block {}", - request.from_hash + "blockstore::RequestFromNumber : missing starting block {}", + request.from_number ); let message: ExternalMessage = - ExternalMessage::ResponseFromHeight(ResponseBlock { proposals: vec![] }); + ExternalMessage::ResponseFromNumber(ResponseBlock { proposals: vec![] }); return Ok(message); }; @@ -213,15 +230,19 @@ impl BlockStore { proposals.push(self.block_to_proposal(block)); } - let message = ExternalMessage::ResponseFromHeight(ResponseBlock { proposals }); + let message = ExternalMessage::ResponseFromNumber(ResponseBlock { proposals }); tracing::trace!( ?message, - "blockstore::RequestFromHeight : responding to block request from height" + "blockstore::RequestFromNumber : responding to block request from height" ); Ok(message) } - /// Pump the proposals into the chain. + /// Inject the proposals into the chain. + /// + /// Besides pumping the set of Proposals into the processing pipeline, it also records the + /// last known Proposal in the pipeline. This is used for speculative fetches, and also for + /// knowing where to continue fetching from. 
fn inject_proposals(&mut self, proposals: Vec) -> Result<()> { tracing::info!( "blockstore::InjectProposals : injecting {} proposals", @@ -231,11 +252,10 @@ impl BlockStore { if proposals.is_empty() { return Ok(()); } - // Sort proposals by number - let proposals = proposals - .into_iter() - .sorted_by_key(|p| p.number()) - .collect_vec(); + + // Store the tip + let (last_block, _) = proposals.last().unwrap().clone().into_parts(); + self.latest_block = Some(last_block); // Increment proposals injected self.injected = self.injected.saturating_add(proposals.len()); @@ -271,7 +291,7 @@ impl BlockStore { } } - pub fn handle_response_from_height( + pub fn handle_response_from_number( &mut self, from: PeerId, response: ResponseBlock, @@ -279,50 +299,54 @@ impl BlockStore { // Process whatever we have received. if response.proposals.is_empty() { // Empty response, downgrade peer - tracing::warn!("blockstore::ResponseFromHeight : empty blocks {from}",); + tracing::warn!("blockstore::ResponseFromNumber : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); } else if response.proposals.len() < self.max_blocks_in_flight { // Partial response, downgrade peer - tracing::warn!("blockstore::ResponseFromHeight : partial blocks {from}",); + tracing::warn!("blockstore::ResponseFromNumber : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { self.done_with_peer(DownGrade::None); } tracing::info!( - "blockstore::ResponseFromHeight : received {} blocks from {}", + "blockstore::ResponseFromNumber : received {} blocks from {}", response.proposals.len(), from ); // TODO: Any additional checks we should do here? - // Last known proposal - let next_hash = response.proposals.last().unwrap().hash(); - - // Inject received proposals - self.inject_proposals(response.proposals)?; - - // Speculatively request more blocks, as there might be more - if self.injected < self.max_blocks_injected { - self.in_flight = self.get_next_peer(); - if let Some(peer) = self.in_flight.as_ref() { - let message = ExternalMessage::RequestFromHeight(RequestBlock { - batch_size: self.max_blocks_in_flight, - from_hash: next_hash, - }); - - tracing::info!( - "Requesting {} future blocks from {}", - self.max_blocks_in_flight, - peer.peer_id, - ); + // Sort proposals by number + let proposals = response + .proposals + .into_iter() + .sorted_by_key(|p| p.number()) + .collect_vec(); - self.message_sender - .send_external_message(peer.peer_id, message)?; + // Insert into the cache. + // If current proposal matches another one in cache, from a different peer, inject the proposal. + // Else, replace the cached values with the new ones. + let mut injections = Vec::new(); + for p in proposals { + // If the proposal already exists + if let Some((peer, proposal)) = self.cache.remove(&p.number()) { + if peer != from && proposal.hash() == p.hash() { + injections.push(proposal); + } else { + // insert the new one and; + self.cache.insert(p.number(), (from, p)); + break; // TODO: Replace the rest + } + } else { + self.cache.insert(p.number(), (from, p)); } } + + // Inject matched proposals + self.inject_proposals(injections)?; + Ok(()) } @@ -391,13 +415,18 @@ impl BlockStore { } } - // highest canonical block we have - // TODO: Replace this with a single SQL query. - let height = self - .db - .get_highest_canonical_block_number()? 
- .unwrap_or_default(); - let alpha_block = self.db.get_canonical_block_by_number(height)?.unwrap(); + // highest canonical block we know + let alpha_block = if self.latest_block.is_some() { + self.latest_block.as_ref().unwrap().clone() + } else { + // TODO: Replace this with a single SQL query. + let height = self + .db + .get_highest_canonical_block_number()? + .unwrap_or_default(); + let alpha_block = self.db.get_canonical_block_by_number(height)?.unwrap(); + alpha_block + }; // Compute the block gap. let block_gap = if let Some(omega_block) = omega_block.as_ref() { @@ -406,23 +435,32 @@ impl BlockStore { .number .saturating_sub(alpha_block.header.number) } else { - // Trigger a RequestFromHeight if the source block is None + // Trigger a RequestFromNumber if the source block is None self.max_blocks_in_flight as u64 }; // TODO: Double-check hysteresis logic - may not even be necessary to do RequestFromHash - let message = if block_gap > self.max_blocks_in_flight as u64 / 2 { + let (message, hash) = if block_gap > self.max_blocks_in_flight as u64 / 2 { // we're far from latest block - ExternalMessage::RequestFromHeight(RequestBlock { - from_hash: alpha_block.header.hash, - batch_size: self.max_blocks_in_flight, - }) + ( + ExternalMessage::RequestFromNumber(RequestBlock { + from_number: alpha_block.number(), + from_hash: alpha_block.hash(), + batch_size: self.max_blocks_in_flight, + }), + alpha_block.hash(), + ) } else { // we're close to latest block - ExternalMessage::RequestFromHash(RequestBlock { - from_hash: omega_block.unwrap().header.hash, - batch_size: self.max_blocks_in_flight, - }) + let omega_block = omega_block.unwrap(); + ( + ExternalMessage::RequestFromHash(RequestBlock { + from_hash: omega_block.hash(), + from_number: omega_block.number(), + batch_size: self.max_blocks_in_flight, + }), + omega_block.hash(), + ) }; let peer = self.in_flight.as_ref().unwrap(); @@ -430,7 +468,7 @@ impl BlockStore { tracing::info!( "Requesting {} missing blocks from {}", self.max_blocks_in_flight, - peer.peer_id, + hash, ); self.message_sender diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index c2724b3cc..6672f51ac 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -229,6 +229,7 @@ impl fmt::Debug for BlockResponse { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RequestBlock { + pub from_number: u64, pub from_hash: Hash, pub batch_size: usize, } @@ -280,9 +281,9 @@ pub enum ExternalMessage { Acknowledgement, AddPeer, RemovePeer, - RequestFromHeight(RequestBlock), + RequestFromNumber(RequestBlock), RequestFromHash(RequestBlock), - ResponseFromHeight(ResponseBlock), + ResponseFromNumber(ResponseBlock), ResponseFromHash(ResponseBlock), InjectedProposal(InjectedProposal), } @@ -305,16 +306,16 @@ impl Display for ExternalMessage { } ExternalMessage::AddPeer => write!(f, "AddPeer"), ExternalMessage::RemovePeer => write!(f, "RemovePeer"), - ExternalMessage::ResponseFromHeight(r) => { - write!(f, "ResponseFromHeight({})", r.proposals.len()) + ExternalMessage::ResponseFromNumber(r) => { + write!(f, "ResponseFromNumber({})", r.proposals.len()) } ExternalMessage::ResponseFromHash(r) => { write!(f, "ResponseFromHash({})", r.proposals.len()) } - ExternalMessage::RequestFromHeight(r) => { + ExternalMessage::RequestFromNumber(r) => { write!( f, - "RequestFromHeight({}, num={})", + "RequestFromNumber({}, num={})", r.from_hash, r.batch_size ) } diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index ce792ca5a..563312e0a 100644 --- a/zilliqa/src/node.rs +++ 
b/zilliqa/src/node.rs @@ -273,11 +273,11 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } - ExternalMessage::RequestFromHeight(request) => { + ExternalMessage::RequestFromNumber(request) => { let message = self .consensus .blockstore - .handle_request_from_height(from, request)?; + .handle_request_from_number(from, request)?; self.request_responses.send((response_channel, message))?; } ExternalMessage::RequestFromHash(request) => { @@ -383,10 +383,10 @@ impl Node { pub fn handle_response(&mut self, from: PeerId, message: ExternalMessage) -> Result<()> { debug!(%from, to = %self.peer_id, %message, "handling response"); match message { - ExternalMessage::ResponseFromHeight(response) => { + ExternalMessage::ResponseFromNumber(response) => { self.consensus .blockstore - .handle_response_from_height(from, response)?; + .handle_response_from_number(from, response)?; } ExternalMessage::ResponseFromHash(response) => { self.consensus From 486056df022f9742d7bbcb1f7d175e8c202a4c7b Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 16:29:38 +0800 Subject: [PATCH 024/119] feat: sort-of working sync with multiple sources of truth. --- zilliqa/src/blockstore.rs | 78 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 466d362c7..37dc3d2c3 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -1,6 +1,7 @@ use std::{ cmp::Ordering, collections::{BinaryHeap, HashMap}, + ops::Sub, sync::Arc, time::{Duration, Instant}, }; @@ -37,6 +38,8 @@ enum DownGrade { // TODO: What if we receive a fork // TODO: How to handle adverserial history +const GAP_THRESHOLD: usize = 5; // How big is big/small gap. + #[derive(Debug)] pub struct BlockStore { // database @@ -162,19 +165,9 @@ impl BlockStore { // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. - let Some(omega_block) = self.db.get_block_by_hash(&request.from_hash)? else { - // We do not have the starting block - tracing::warn!( - "blockstore::RequestFromHash : missing starting block {}", - request.from_hash - ); - let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals: vec![] }); - return Ok(message); - }; - let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send let mut proposals: Vec = Vec::new(); - let mut hash = omega_block.parent_hash(); + let mut hash = request.from_hash; while proposals.len() < batch_size { // grab the parent let Some(block) = self.db.get_block_by_hash(&hash)? else { @@ -361,7 +354,7 @@ impl BlockStore { tracing::warn!("blockstore::ResponseFromHash : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); - } else if response.proposals.len() <= self.max_blocks_in_flight / 2 { + } else if response.proposals.len() < GAP_THRESHOLD { // Partial response, downgrade peer // Skip processing because we want to ensure that we have ALL the needed blocks to sync up. tracing::warn!("blockstore::ResponseFromHash : partial blocks {from}",); @@ -379,9 +372,15 @@ impl BlockStore { ); // TODO: Any additional checks we should do here? 
+ // Sort proposals by number + let proposals = response + .proposals + .into_iter() + .sorted_by_key(|p| p.number()) + .collect_vec(); // Inject the proposals - self.inject_proposals(response.proposals)?; + self.inject_proposals(proposals)?; Ok(()) } @@ -439,38 +438,39 @@ impl BlockStore { self.max_blocks_in_flight as u64 }; - // TODO: Double-check hysteresis logic - may not even be necessary to do RequestFromHash - let (message, hash) = if block_gap > self.max_blocks_in_flight as u64 / 2 { + let peer = self.in_flight.as_ref().unwrap(); + + let message = if block_gap > self.max_blocks_in_flight.sub(GAP_THRESHOLD) as u64 { // we're far from latest block - ( - ExternalMessage::RequestFromNumber(RequestBlock { - from_number: alpha_block.number(), - from_hash: alpha_block.hash(), - batch_size: self.max_blocks_in_flight, - }), - alpha_block.hash(), - ) + let message = RequestBlock { + from_number: alpha_block.number(), + from_hash: alpha_block.hash(), + batch_size: self.max_blocks_in_flight, + }; + tracing::info!( + "blockstore::RequestMissingBlocks : requesting {} blocks at {} from {}", + message.batch_size, + message.from_number, + peer.peer_id, + ); + ExternalMessage::RequestFromNumber(message) } else { // we're close to latest block let omega_block = omega_block.unwrap(); - ( - ExternalMessage::RequestFromHash(RequestBlock { - from_hash: omega_block.hash(), - from_number: omega_block.number(), - batch_size: self.max_blocks_in_flight, - }), - omega_block.hash(), - ) + let message = RequestBlock { + from_hash: omega_block.hash(), + from_number: omega_block.number(), + batch_size: GAP_THRESHOLD * 2, + }; + tracing::info!( + "blockstore::RequestMissingBlocks : requesting {} blocks at {} from {}", + message.batch_size, + message.from_hash, + peer.peer_id, + ); + ExternalMessage::RequestFromHash(message) }; - let peer = self.in_flight.as_ref().unwrap(); - - tracing::info!( - "Requesting {} missing blocks from {}", - self.max_blocks_in_flight, - hash, - ); - self.message_sender .send_external_message(peer.peer_id, message)?; Ok(()) From de60034f29f22901d0c7b3ec56638520cbd7dc2a Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 17:05:02 +0800 Subject: [PATCH 025/119] feat: pre-allocate enough capacity; corrected block_gap check. --- zilliqa/src/blockstore.rs | 43 ++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 37dc3d2c3..f97f66e4a 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -1,7 +1,6 @@ use std::{ cmp::Ordering, collections::{BinaryHeap, HashMap}, - ops::Sub, sync::Arc, time::{Duration, Instant}, }; @@ -30,9 +29,15 @@ enum DownGrade { /// If not, then it triggers a syncing algorithm. /// /// 1. We check if the gap between our last canonical block and the latest Proposal. -/// a. If it is a small gap, we request for history, going backwards from Proposal. -/// b. If it is a big gap, we request for history, going forwards from Canonical. -/// 2. When we receive a response, we inject the Proposals into our processing pipeline. +/// a. If it is a small gap, we request for blocks, going backwards from Proposal. +/// b. If it is a big gap, we request for blocks, going forwards from Canonical. +/// 2. When we receive a forwards history response, we check for matches against the cache. +/// This means that for a proposal to be injected, it must be corroborated by 2 sources. +/// a. 
If it matches the cached Proposal, we inject the proposal into the pipeline. +/// b. If it does not exist in the cache, we cache the new value. +/// c. If it does not match the cached Proposal, something is up and we stop there and request for more. +/// 3. When we receive a backwards history response, we inject it into the pipeline. +/// a. If it does not link up with the existing Canonical, then it will be dropped. /// // TODO: What if we receive a fork @@ -165,8 +170,8 @@ impl BlockStore { // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. - let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send - let mut proposals: Vec = Vec::new(); + let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return + let mut proposals = Vec::with_capacity(batch_size); let mut hash = request.from_hash; while proposals.len() < batch_size { // grab the parent @@ -201,21 +206,12 @@ impl BlockStore { // TODO: Check if we should service this request. // Validators shall not respond to this request. - let Some(alpha) = self.db.get_canonical_block_by_number(request.from_number)? else { - // We do not have the starting block - tracing::warn!( - "blockstore::RequestFromNumber : missing starting block {}", - request.from_number - ); - let message: ExternalMessage = - ExternalMessage::ResponseFromNumber(ResponseBlock { proposals: vec![] }); - return Ok(message); - }; - // TODO: Replace this with a single SQL query - let batch_size = self.max_blocks_in_flight.min(request.batch_size) as u64; // mitigate DOS attacks by limiting the number of blocks we send - let mut proposals = Vec::new(); - for num in alpha.number().saturating_add(1)..=alpha.number().saturating_add(batch_size) { + let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send + let mut proposals = Vec::with_capacity(batch_size); + for num in request.from_number.saturating_add(1) + ..=request.from_number.saturating_add(batch_size as u64) + { let Some(block) = self.db.get_canonical_block_by_number(num)? else { // that's all we have! break; @@ -321,7 +317,7 @@ impl BlockStore { // Insert into the cache. // If current proposal matches another one in cache, from a different peer, inject the proposal. // Else, replace the cached values with the new ones. - let mut injections = Vec::new(); + let mut injections = Vec::with_capacity(proposals.len()); for p in proposals { // If the proposal already exists if let Some((peer, proposal)) = self.cache.remove(&p.number()) { @@ -404,9 +400,6 @@ impl BlockStore { return Ok(()); } } else { - if self.injected > 0 { - return Ok(()); - } self.in_flight = self.get_next_peer(); if self.in_flight.is_none() { tracing::warn!("Insufficient peers available to request missing blocks"); @@ -440,7 +433,7 @@ impl BlockStore { let peer = self.in_flight.as_ref().unwrap(); - let message = if block_gap > self.max_blocks_in_flight.sub(GAP_THRESHOLD) as u64 { + let message = if block_gap > GAP_THRESHOLD as u64 { // we're far from latest block let message = RequestBlock { from_number: alpha_block.number(), From 247c504317b4abe7b554b0997668b092f0fc7da9 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 17:31:15 +0800 Subject: [PATCH 026/119] feat: replace non-corroborated blocks in cache. 
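
A forward-sync block is now only injected once two different peers have
returned the same proposal for that height; on a mismatch the newer response
simply replaces the cached entry and we wait for further corroboration. A
simplified, self-contained sketch of the rule (u64 peer ids and raw 32-byte
hashes stand in for the real PeerId and Proposal types):

    use std::collections::HashMap;

    // Returns true when a second, distinct peer confirms the cached hash.
    fn corroborate(
        cache: &mut HashMap<u64, (u64, [u8; 32])>,
        from: u64,
        number: u64,
        hash: [u8; 32],
    ) -> bool {
        match cache.remove(&number) {
            Some((peer, cached)) if peer != from && cached == hash => true,
            _ => {
                // First sighting, or a conflicting hash: (re)cache and wait.
                cache.insert(number, (from, hash));
                false
            }
        }
    }

    fn main() {
        let mut cache = HashMap::new();
        let h = [1u8; 32];
        assert!(!corroborate(&mut cache, 1, 42, h)); // cached only
        assert!(corroborate(&mut cache, 2, 42, h)); // second peer agrees: inject
    }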
--- zilliqa/src/blockstore.rs | 56 ++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index f97f66e4a..977498bee 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -33,15 +33,14 @@ enum DownGrade { /// b. If it is a big gap, we request for blocks, going forwards from Canonical. /// 2. When we receive a forwards history response, we check for matches against the cache. /// This means that for a proposal to be injected, it must be corroborated by 2 sources. -/// a. If it matches the cached Proposal, we inject the proposal into the pipeline. -/// b. If it does not exist in the cache, we cache the new value. -/// c. If it does not match the cached Proposal, something is up and we stop there and request for more. +/// a. If it matches the cached value, we inject the proposal into the pipeline. +/// b. If it does not match, we replace the cached value and request for more. +/// b. If it does not exist in the cache, we cache the proposal. /// 3. When we receive a backwards history response, we inject it into the pipeline. -/// a. If it does not link up with the existing Canonical, then it will be dropped. +/// a. If it does not line up with the existing Canonical, then it will be dropped. /// -// TODO: What if we receive a fork -// TODO: How to handle adverserial history +// TODO: Speculative fetch, to speed things up. const GAP_THRESHOLD: usize = 5; // How big is big/small gap. @@ -58,9 +57,9 @@ pub struct BlockStore { // in-flight timeout request_timeout: Duration, // how many blocks to request at once - max_blocks_in_flight: usize, + max_batch_size: usize, // how many blocks to inject into the queue - max_blocks_injected: usize, + max_blocks_in_flight: usize, // our peer id peer_id: PeerId, // how many injected proposals @@ -93,8 +92,8 @@ impl BlockStore { peers, peer_id, request_timeout: config.consensus.consensus_timeout, - max_blocks_in_flight: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. - max_blocks_injected: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks + max_batch_size: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. + max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, injected: 0, cache: HashMap::new(), @@ -170,7 +169,7 @@ impl BlockStore { // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. - let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return + let batch_size = self.max_batch_size.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return let mut proposals = Vec::with_capacity(batch_size); let mut hash = request.from_hash; while proposals.len() < batch_size { @@ -207,7 +206,7 @@ impl BlockStore { // Validators shall not respond to this request. 
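// The loop below fetches canonical blocks one height at a time via
// get_canonical_block_by_number; the TODO that follows envisions collapsing it
// into a single range query. Purely illustrative shape, assuming a SQL schema
// with a canonical-height index (not the actual Db API):
//   SELECT block FROM blocks
//   WHERE height > :from_number AND canonical = 1
//   ORDER BY height ASC LIMIT :batch_size;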
// TODO: Replace this with a single SQL query - let batch_size = self.max_blocks_in_flight.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send + let batch_size = self.max_batch_size.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send let mut proposals = Vec::with_capacity(batch_size); for num in request.from_number.saturating_add(1) ..=request.from_number.saturating_add(batch_size as u64) @@ -291,7 +290,7 @@ impl BlockStore { tracing::warn!("blockstore::ResponseFromNumber : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); - } else if response.proposals.len() < self.max_blocks_in_flight { + } else if response.proposals.len() < self.max_batch_size { // Partial response, downgrade peer tracing::warn!("blockstore::ResponseFromNumber : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); @@ -316,25 +315,34 @@ impl BlockStore { // Insert into the cache. // If current proposal matches another one in cache, from a different peer, inject the proposal. - // Else, replace the cached values with the new ones. - let mut injections = Vec::with_capacity(proposals.len()); - for p in proposals { - // If the proposal already exists + // Else, replace the cached Proposal with the new one. + let mut corroborated_proposals = Vec::with_capacity(proposals.len()); + let mut props = proposals.into_iter(); + + // Collect corroborated proposals + for p in props.by_ref() { if let Some((peer, proposal)) = self.cache.remove(&p.number()) { + // If the proposal already exists if peer != from && proposal.hash() == p.hash() { - injections.push(proposal); + // is corroborated proposal + corroborated_proposals.push(proposal); } else { - // insert the new one and; + // insert the different one and; self.cache.insert(p.number(), (from, p)); - break; // TODO: Replace the rest + break; // replace the rest in the next loop } } else { self.cache.insert(p.number(), (from, p)); } } + // Replace/insert the rest of the proposals in the cache + for p in props { + self.cache.insert(p.number(), (from, p)); + } + // Inject matched proposals - self.inject_proposals(injections)?; + self.inject_proposals(corroborated_proposals)?; Ok(()) } @@ -382,7 +390,7 @@ impl BlockStore { /// Request blocks between the current height and the given block. /// - /// The approach is to request blocks in batches of `max_blocks_in_flight` blocks. + /// The approach is to request blocks in batches of `max_batch_size` blocks. /// If None block is provided, we request blocks from the last known canonical block forwards. /// If the block gap is large, we request blocks from the last known canonical block forwards. /// If the block gap is small, we request blocks from the latest block backwards. @@ -428,7 +436,7 @@ impl BlockStore { .saturating_sub(alpha_block.header.number) } else { // Trigger a RequestFromNumber if the source block is None - self.max_blocks_in_flight as u64 + self.max_batch_size as u64 }; let peer = self.in_flight.as_ref().unwrap(); @@ -438,7 +446,7 @@ impl BlockStore { let message = RequestBlock { from_number: alpha_block.number(), from_hash: alpha_block.hash(), - batch_size: self.max_blocks_in_flight, + batch_size: self.max_batch_size, }; tracing::info!( "blockstore::RequestMissingBlocks : requesting {} blocks at {} from {}", From d87d09ed5734704b05099ee9b94674e91d370eaa Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 17:59:21 +0800 Subject: [PATCH 027/119] chore: clippy. checkpoint. 
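
A recurring pattern in the handlers above is grading each peer response before using it: an empty response, a partial batch and a full batch lead to different downgrades of the serving peer (timeouts are handled separately when an in-flight request expires). A compact sketch with a hypothetical classify_response helper; the real code performs the same checks inline around done_with_peer:

    // Response grading used by the sync handlers, in isolation.
    #[derive(Debug, PartialEq)]
    enum DownGrade {
        None,    // full batch received
        Partial, // fewer blocks than requested
        Empty,   // nothing useful returned
    }

    fn classify_response(received: usize, requested: usize) -> DownGrade {
        if received == 0 {
            DownGrade::Empty
        } else if received < requested {
            DownGrade::Partial
        } else {
            DownGrade::None
        }
    }

    fn main() {
        assert_eq!(classify_response(0, 100), DownGrade::Empty);
        assert_eq!(classify_response(40, 100), DownGrade::Partial);
        assert_eq!(classify_response(100, 100), DownGrade::None);
    }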
[corroborate proposals] --- zilliqa/src/blockstore.rs | 40 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 977498bee..d8a5eff13 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -23,23 +23,22 @@ enum DownGrade { Empty, } -/// Syncing Algorithm -/// -/// When a Proposal is received by Consensus, we check if the parent exists in our DB. -/// If not, then it triggers a syncing algorithm. -/// -/// 1. We check if the gap between our last canonical block and the latest Proposal. -/// a. If it is a small gap, we request for blocks, going backwards from Proposal. -/// b. If it is a big gap, we request for blocks, going forwards from Canonical. -/// 2. When we receive a forwards history response, we check for matches against the cache. -/// This means that for a proposal to be injected, it must be corroborated by 2 sources. -/// a. If it matches the cached value, we inject the proposal into the pipeline. -/// b. If it does not match, we replace the cached value and request for more. -/// b. If it does not exist in the cache, we cache the proposal. -/// 3. When we receive a backwards history response, we inject it into the pipeline. -/// a. If it does not line up with the existing Canonical, then it will be dropped. -/// - +// Syncing Algorithm +// +// When a Proposal is received by Consensus, we check if the parent exists in our DB. +// If not, then it triggers a syncing algorithm. +// +// 1. We check if the gap between our last canonical block and the latest Proposal. +// a. If it is a small gap, we request for blocks, going backwards from Proposal. +// b. If it is a big gap, we request for blocks, going forwards from Canonical. +// 2. When we receive a forwards history response, we check for matches against the cache. +// This means that for a proposal to be injected, it must be corroborated by 2 sources. +// a. If it matches the cached value, we inject the proposal into the pipeline. +// b. If it does not match, we replace the cached value and request for more. +// b. If it does not exist in the cache, we cache the proposal. +// 3. When we receive a backwards history response, we inject it into the pipeline. +// a. If it does not line up with the existing Canonical, then it will be dropped. +// // TODO: Speculative fetch, to speed things up. const GAP_THRESHOLD: usize = 5; // How big is big/small gap. @@ -59,7 +58,7 @@ pub struct BlockStore { // how many blocks to request at once max_batch_size: usize, // how many blocks to inject into the queue - max_blocks_in_flight: usize, + _max_blocks_in_flight: usize, // our peer id peer_id: PeerId, // how many injected proposals @@ -93,7 +92,7 @@ impl BlockStore { peer_id, request_timeout: config.consensus.consensus_timeout, max_batch_size: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. - max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks + _max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, injected: 0, cache: HashMap::new(), @@ -424,8 +423,7 @@ impl BlockStore { .db .get_highest_canonical_block_number()? .unwrap_or_default(); - let alpha_block = self.db.get_canonical_block_by_number(height)?.unwrap(); - alpha_block + self.db.get_canonical_block_by_number(height)?.unwrap() }; // Compute the block gap. 
From eed2ee658ab0a71af22759148340ee1ef5535a0f Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 18:56:03 +0800 Subject: [PATCH 028/119] feat: [speculative fetch] --- zilliqa/src/blockstore.rs | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index d8a5eff13..67cb9a6c9 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -39,7 +39,7 @@ enum DownGrade { // 3. When we receive a backwards history response, we inject it into the pipeline. // a. If it does not line up with the existing Canonical, then it will be dropped. // -// TODO: Speculative fetch, to speed things up. +// TODO: How to handle case where only single source of truth i.e. bootstrap node? const GAP_THRESHOLD: usize = 5; // How big is big/small gap. @@ -58,7 +58,7 @@ pub struct BlockStore { // how many blocks to request at once max_batch_size: usize, // how many blocks to inject into the queue - _max_blocks_in_flight: usize, + max_blocks_in_flight: usize, // our peer id peer_id: PeerId, // how many injected proposals @@ -92,7 +92,7 @@ impl BlockStore { peer_id, request_timeout: config.consensus.consensus_timeout, max_batch_size: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. - _max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks + max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, injected: 0, cache: HashMap::new(), @@ -343,6 +343,32 @@ impl BlockStore { // Inject matched proposals self.inject_proposals(corroborated_proposals)?; + // Fire speculative request + if self.latest_block.is_some() { + if self.injected < self.max_blocks_in_flight { + if let Some(peer) = self.get_next_peer() { + // we're far from latest block + let message = RequestBlock { + from_number: self.latest_block.as_ref().unwrap().number(), + from_hash: self.latest_block.as_ref().unwrap().hash(), + batch_size: self.max_batch_size, + }; + tracing::info!( + "blockstore::RequestMissingBlocks : speculative requesting {} blocks at {} from {}", + message.batch_size, + message.from_number, + peer.peer_id, + ); + self.message_sender.send_external_message( + peer.peer_id, + ExternalMessage::RequestFromNumber(message), + )?; + + self.in_flight = Some(peer); + } + } + } + Ok(()) } From 66abff8aaf594bfb7ded62a4d3719736cbc8b281 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 21:33:12 +0800 Subject: [PATCH 029/119] feat: remove peer check, which allows it to proceed under circumstances where there is only 1 peer with the blocks. 
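
This change relaxes the corroboration rule sketched earlier: the cache now stores only the proposal, so any repeated response carrying the same hash for a height corroborates it, even when both responses come from the same peer (for example, when only the bootstrap node holds the blocks). A minimal sketch of the relaxed check, again with simplified stand-in types rather than the node's real ones:

    use std::collections::HashMap;

    type Hash = [u8; 32];

    struct Proposal {
        number: u64,
        hash: Hash,
    }

    struct Cache {
        cache: HashMap<u64, Proposal>,
    }

    impl Cache {
        // Any repeat of the same hash for a height corroborates the cached
        // proposal; a differing hash replaces it and corroboration restarts.
        fn offer(&mut self, p: Proposal) -> Option<Proposal> {
            match self.cache.remove(&p.number) {
                Some(cached) if cached.hash == p.hash => Some(cached),
                _ => {
                    self.cache.insert(p.number, p);
                    None
                }
            }
        }
    }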
--- zilliqa/src/blockstore.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 67cb9a6c9..2371ec4fd 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -64,7 +64,7 @@ pub struct BlockStore { // how many injected proposals injected: usize, // cache - cache: HashMap, + cache: HashMap, latest_block: Option, } @@ -108,7 +108,7 @@ impl BlockStore { prop.from ); } - if let Some((_, p)) = self.cache.remove(&prop.block.number()) { + if let Some(p) = self.cache.remove(&prop.block.number()) { tracing::warn!( "blockstore::MarkReceivedProposal : removing stale cache proposal {}", p.number() @@ -320,24 +320,24 @@ impl BlockStore { // Collect corroborated proposals for p in props.by_ref() { - if let Some((peer, proposal)) = self.cache.remove(&p.number()) { + if let Some(proposal) = self.cache.remove(&p.number()) { // If the proposal already exists - if peer != from && proposal.hash() == p.hash() { + if proposal.hash() == p.hash() { // is corroborated proposal corroborated_proposals.push(proposal); } else { // insert the different one and; - self.cache.insert(p.number(), (from, p)); + self.cache.insert(p.number(), p); break; // replace the rest in the next loop } } else { - self.cache.insert(p.number(), (from, p)); + self.cache.insert(p.number(), p); } } // Replace/insert the rest of the proposals in the cache for p in props { - self.cache.insert(p.number(), (from, p)); + self.cache.insert(p.number(), p); } // Inject matched proposals From 27ae3ea595c60375c760ba48e86602946024e602 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 30 Dec 2024 21:58:48 +0800 Subject: [PATCH 030/119] chore: clippy. --- zilliqa/src/blockstore.rs | 47 ++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 2371ec4fd..eec712fd6 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -344,28 +344,25 @@ impl BlockStore { self.inject_proposals(corroborated_proposals)?; // Fire speculative request - if self.latest_block.is_some() { - if self.injected < self.max_blocks_in_flight { - if let Some(peer) = self.get_next_peer() { - // we're far from latest block - let message = RequestBlock { - from_number: self.latest_block.as_ref().unwrap().number(), - from_hash: self.latest_block.as_ref().unwrap().hash(), - batch_size: self.max_batch_size, - }; - tracing::info!( - "blockstore::RequestMissingBlocks : speculative requesting {} blocks at {} from {}", - message.batch_size, - message.from_number, - peer.peer_id, - ); - self.message_sender.send_external_message( - peer.peer_id, - ExternalMessage::RequestFromNumber(message), - )?; - - self.in_flight = Some(peer); - } + if self.latest_block.is_some() && self.injected < self.max_blocks_in_flight { + if let Some(peer) = self.get_next_peer() { + // we're far from latest block + let message = RequestBlock { + from_number: self.latest_block.as_ref().unwrap().number(), + from_hash: self.latest_block.as_ref().unwrap().hash(), + batch_size: self.max_batch_size, + }; + tracing::info!( + "blockstore::RequestMissingBlocks : speculative fetch {} blocks at {} from {}", + message.batch_size, + message.from_number, + peer.peer_id, + ); + self.message_sender.send_external_message( + peer.peer_id, + ExternalMessage::RequestFromNumber(message), + )?; + self.in_flight = Some(peer); } } @@ -424,7 +421,7 @@ impl BlockStore { if let Some(peer) = self.in_flight.as_ref() { if 
peer.last_used.elapsed() > self.request_timeout { tracing::warn!( - "In-flight request {} timed out, requesting from new peer", + "blockstore::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", peer.peer_id ); self.done_with_peer(DownGrade::Timeout); @@ -435,7 +432,7 @@ impl BlockStore { } else { self.in_flight = self.get_next_peer(); if self.in_flight.is_none() { - tracing::warn!("Insufficient peers available to request missing blocks"); + tracing::warn!("blockstore::RequestMissingBlocks : insufficient peers to request missing blocks"); return Ok(()); } } @@ -485,7 +482,7 @@ impl BlockStore { let message = RequestBlock { from_hash: omega_block.hash(), from_number: omega_block.number(), - batch_size: GAP_THRESHOLD * 2, + batch_size: GAP_THRESHOLD + 1, }; tracing::info!( "blockstore::RequestMissingBlocks : requesting {} blocks at {} from {}", From 20ccc462327b9e7f3aff4966c7fffb5a741ef268 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 2 Jan 2025 15:21:17 +0800 Subject: [PATCH 031/119] feat: added handle_metadata_request/response(). --- zilliqa/src/blockstore.rs | 208 +++++++++++++++++++++++++++----------- zilliqa/src/message.rs | 26 +++++ zilliqa/src/node.rs | 12 +++ 3 files changed, 188 insertions(+), 58 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index eec712fd6..3800870f6 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -1,10 +1,11 @@ use std::{ cmp::Ordering, - collections::{BinaryHeap, HashMap}, + collections::{BTreeMap, BinaryHeap, HashMap}, sync::Arc, time::{Duration, Instant}, }; +use crate::crypto::Hash; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; @@ -12,7 +13,10 @@ use libp2p::PeerId; use crate::{ cfg::NodeConfig, db::Db, - message::{Block, ExternalMessage, InjectedProposal, Proposal, RequestBlock, ResponseBlock}, + message::{ + Block, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, RequestBlock, + ResponseBlock, + }, node::MessageSender, }; @@ -66,6 +70,10 @@ pub struct BlockStore { // cache cache: HashMap, latest_block: Option, + + // Chain metadata + chain_metadata: BTreeMap, + last_metadata: Option, } impl BlockStore { @@ -97,10 +105,14 @@ impl BlockStore { injected: 0, cache: HashMap::new(), latest_block: None, + chain_metadata: BTreeMap::new(), + last_metadata: None, }) } - /// Match a received proposal + /// Mark a received proposal + /// + /// Mark a proposal as received, and remove it from the cache. pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { if prop.from != self.peer_id { tracing::error!( @@ -131,7 +143,7 @@ impl BlockStore { "blockstore::ProcessProposal : Parent block {} not found", block.parent_hash() ); - self.request_missing_blocks(Some(block))?; + self.request_missing_chain(Some(block))?; return Ok(()); } Ok(()) @@ -152,6 +164,123 @@ impl BlockStore { Proposal::from_parts(block, txs) } + /// Convenience function to extract metadata from the block. + fn block_to_metadata(&self, block: Block) -> ChainMetaData { + ChainMetaData { + block_number: block.number(), + block_hash: block.hash(), + parent_hash: block.parent_hash(), + block_timestamp: block.timestamp(), + } + } + + pub fn handle_metadata_response( + &mut self, + from: PeerId, + response: Vec, + ) -> Result<()> { + // ... + tracing::info!( + "blockstore::MetadataResponse : received {} metadata from {}", + response.len(), + from + ); + + // Process whatever we have received. 
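
On the serving side, a metadata request is answered by walking parent hashes backwards from the requested hash, capped at a batch size. A sketch of that walk under simplified assumptions: Hash and ChainMetaData below are stand-ins, and lookup is a placeholder for the database query by hash used in the handler.

    // Simplified stand-ins for the node's Hash and ChainMetaData types.
    #[derive(Clone, Copy)]
    struct Hash(u64);

    struct ChainMetaData {
        block_number: u64,
        block_hash: Hash,
        parent_hash: Hash,
    }

    // Walk backwards from `from_hash` along parent_hash, collecting at most
    // `batch_size` entries and stopping early at the first unknown block.
    fn collect_metadata(
        mut from_hash: Hash,
        batch_size: usize,
        lookup: impl Fn(Hash) -> Option<ChainMetaData>,
    ) -> Vec<ChainMetaData> {
        let mut out = Vec::with_capacity(batch_size);
        while out.len() < batch_size {
            let Some(meta) = lookup(from_hash) else { break };
            from_hash = meta.parent_hash;
            out.push(meta);
        }
        out
    }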
+ if response.is_empty() { + // Empty response, downgrade peer + tracing::warn!("blockstore::MetadataResponse : empty blocks {from}",); + self.done_with_peer(DownGrade::Empty); + return Ok(()); + } else if response.len() < self.max_blocks_in_flight { + // Partial response, downgrade peer + tracing::warn!("blockstore::MetadataResponse : partial blocks {from}",); + self.done_with_peer(DownGrade::Partial); + } else { + self.done_with_peer(DownGrade::None); + } + + // Sort metadata by number, reversed + let mut metadata = response + .into_iter() + .sorted_by_key(|f| f.block_number) + .collect_vec(); + metadata.reverse(); + // mark the block + metadata.last_mut().unwrap().parent_hash = metadata.first().unwrap().block_hash; + + // Store the metadata + for meta in metadata { + // TODO: Check the linkage of the returned chain + if let Some(meta) = self.chain_metadata.insert(meta.block_hash, meta) { + self.last_metadata = Some(meta); + } + } + + // If the last block does not exist in our canonical history, fire the next request + if self.last_metadata.is_some() + && self + .db + .get_block_by_hash(&self.last_metadata.as_ref().unwrap().block_hash)? + .is_none() + { + self.request_missing_chain(None)?; + } else { + // Hit our internal history. Begin replicating chain. + self.request_missing_blocks()?; + } + + Ok(()) + } + + fn request_missing_blocks(&mut self) -> Result<()> { + // ... + tracing::info!( + "blockstore::RequestMissingBlocks : requesting missing blocks {:?}", + self.last_metadata + ); + + Ok(()) + } + + /// Returns the metadata of the chain from a given hash. + /// + /// This constructs a historical chain going backwards from a hash, by following the parent_hash. + /// It collects N blocks and returns the metadata of that particular chain. + /// This is mainly used in Phase 1 of the syncing algorithm, to construct a chain history. + pub fn handle_metadata_request( + &mut self, + from: PeerId, + request: RequestBlock, + ) -> Result { + tracing::info!( + "blockstore::MetadataRequest : received a metadata request from {}", + from + ); + + // TODO: Check if we should service this request + // Validators could respond to this request if there is nothing else to do. + + let batch_size = self.max_batch_size.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return + let mut metas = Vec::with_capacity(batch_size); + let mut hash = request.from_hash; + while metas.len() < batch_size { + // grab the parent + let Some(block) = self.db.get_block_by_hash(&hash)? else { + break; // that's all we have! + }; + hash = block.parent_hash(); + metas.push(self.block_to_metadata(block)); + } + + let message = ExternalMessage::MetaDataResponse(metas); + tracing::trace!( + ?message, + "blockstore::MetadataFromHash : responding to block request" + ); + Ok(message) + } + /// Request blocks from a hash, backwards. /// /// It will collect N blocks by following the block.parent_hash() of the requested block. @@ -416,7 +545,7 @@ impl BlockStore { /// If None block is provided, we request blocks from the last known canonical block forwards. /// If the block gap is large, we request blocks from the last known canonical block forwards. /// If the block gap is small, we request blocks from the latest block backwards. - pub fn request_missing_blocks(&mut self, omega_block: Option) -> Result<()> { + pub fn request_missing_chain(&mut self, omega_block: Option) -> Result<()> { // Early exit if there's a request in-flight; and if it has not expired. 
if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { @@ -437,62 +566,25 @@ impl BlockStore { } } - // highest canonical block we know - let alpha_block = if self.latest_block.is_some() { - self.latest_block.as_ref().unwrap().clone() - } else { - // TODO: Replace this with a single SQL query. - let height = self - .db - .get_highest_canonical_block_number()? - .unwrap_or_default(); - self.db.get_canonical_block_by_number(height)?.unwrap() - }; - - // Compute the block gap. - let block_gap = if let Some(omega_block) = omega_block.as_ref() { - omega_block - .header - .number - .saturating_sub(alpha_block.header.number) + let message = if let Some(omega_block) = omega_block { + ExternalMessage::MetaDataRequest(RequestBlock { + from_number: omega_block.number(), + from_hash: omega_block.hash(), + batch_size: self.max_blocks_in_flight, + }) } else { - // Trigger a RequestFromNumber if the source block is None - self.max_batch_size as u64 + ExternalMessage::MetaDataRequest(RequestBlock { + from_number: self.last_metadata.as_ref().unwrap().block_number, + from_hash: self.last_metadata.as_ref().unwrap().block_hash, + batch_size: self.max_blocks_in_flight, + }) }; - let peer = self.in_flight.as_ref().unwrap(); - - let message = if block_gap > GAP_THRESHOLD as u64 { - // we're far from latest block - let message = RequestBlock { - from_number: alpha_block.number(), - from_hash: alpha_block.hash(), - batch_size: self.max_batch_size, - }; - tracing::info!( - "blockstore::RequestMissingBlocks : requesting {} blocks at {} from {}", - message.batch_size, - message.from_number, - peer.peer_id, - ); - ExternalMessage::RequestFromNumber(message) - } else { - // we're close to latest block - let omega_block = omega_block.unwrap(); - let message = RequestBlock { - from_hash: omega_block.hash(), - from_number: omega_block.number(), - batch_size: GAP_THRESHOLD + 1, - }; - tracing::info!( - "blockstore::RequestMissingBlocks : requesting {} blocks at {} from {}", - message.batch_size, - message.from_hash, - peer.peer_id, - ); - ExternalMessage::RequestFromHash(message) - }; - + tracing::info!( + ?message, + "blockstore::RequestMissingBlocks : requesting missing chain from {}", + peer.peer_id + ); self.message_sender .send_external_message(peer.peer_id, message)?; Ok(()) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 6672f51ac..15ff364aa 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -247,6 +247,16 @@ pub struct InjectedProposal { pub block: Proposal, } +/// Used to hold metadata about the chain +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChainMetaData { + // An encoded PeerId + pub block_hash: Hash, + pub parent_hash: Hash, + pub block_number: u64, + pub block_timestamp: SystemTime, +} + /// Used to convey proposal processing internally, to avoid blocking threads for too long. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProcessProposal { @@ -286,6 +296,10 @@ pub enum ExternalMessage { ResponseFromNumber(ResponseBlock), ResponseFromHash(ResponseBlock), InjectedProposal(InjectedProposal), + MetaDataRequest(RequestBlock), + MetaDataResponse(Vec), + MultiBlockRequest(Vec), + MultiBlockResponse(Vec), } impl ExternalMessage { @@ -301,6 +315,18 @@ impl ExternalMessage { impl Display for ExternalMessage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { + ExternalMessage::MultiBlockRequest(r) => { + write!(f, "MultiBlockRequest({})", r.len()) + } + ExternalMessage::MultiBlockResponse(r) => { + write!(f, "MultiBlockResponse({})", r.len()) + } + ExternalMessage::MetaDataResponse(r) => { + write!(f, "MetaDataResponse({})", r.len()) + } + ExternalMessage::MetaDataRequest(r) => { + write!(f, "MetaDataRequest({}, num={})", r.from_hash, r.batch_size) + } ExternalMessage::InjectedProposal(p) => { write!(f, "InjectedProposal {}", p.block.number()) } diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 563312e0a..440656849 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -273,6 +273,13 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } + ExternalMessage::MetaDataRequest(request) => { + let message = self + .consensus + .blockstore + .handle_metadata_request(from, request)?; + self.request_responses.send((response_channel, message))?; + } ExternalMessage::RequestFromNumber(request) => { let message = self .consensus @@ -383,6 +390,11 @@ impl Node { pub fn handle_response(&mut self, from: PeerId, message: ExternalMessage) -> Result<()> { debug!(%from, to = %self.peer_id, %message, "handling response"); match message { + ExternalMessage::MetaDataResponse(response) => { + self.consensus + .blockstore + .handle_metadata_response(from, response)?; + } ExternalMessage::ResponseFromNumber(response) => { self.consensus .blockstore From 7580b399b452303dcd0bdc6b4423f75cf9ab249a Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 3 Jan 2025 09:58:23 +0800 Subject: [PATCH 032/119] feat: [checkpoint - retrieve chain metadata]. --- zilliqa/src/blockstore.rs | 83 ++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 3800870f6..319d6b5e9 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -74,6 +74,7 @@ pub struct BlockStore { // Chain metadata chain_metadata: BTreeMap, last_metadata: Option, + landmark_metadata: Vec, } impl BlockStore { @@ -107,6 +108,7 @@ impl BlockStore { latest_block: None, chain_metadata: BTreeMap::new(), last_metadata: None, + landmark_metadata: Vec::new(), }) } @@ -174,6 +176,46 @@ impl BlockStore { } } + /// Request missing blocks from the chain. + /// + /// It constructs a set of hashes, which constitute the series of blocks that are missing. + /// These hashes are then sent to a Peer for retrieval. + fn request_missing_blocks(&mut self) -> Result<()> { + // ... 
+ tracing::info!("blockstore::RequestMissingBlocks : requesting missing blocks"); + + // If we have no landmarks, we have nothing to do + if let Some(mut hash) = self.landmark_metadata.pop() { + let mut request_hashes = Vec::with_capacity(self.max_batch_size); + request_hashes.push(hash); + while let Some(meta) = self.chain_metadata.remove(&hash) { + request_hashes.push(meta.block_hash); + hash = meta.parent_hash; + // re-insert the metadata so as not to lose it + // self.chain_metadata.insert(hash, meta); + } + // Fire request + if let Some(peer) = self.get_next_peer() { + tracing::debug!( + "blockstore::RequestMissingBlocks : requesting {} blocks from {}", + request_hashes.len(), + peer.peer_id + ); + self.message_sender.send_external_message( + peer.peer_id, + ExternalMessage::MultiBlockRequest(request_hashes), + )?; + self.in_flight = Some(peer); + } + } + + Ok(()) + } + + /// Handle a response to a metadata request. + /// + /// This is the first step in the syncing algorithm, where we receive a set of metadata and use it to + /// construct a chain history. We then request the missing blocks from the chain. pub fn handle_metadata_response( &mut self, from: PeerId, @@ -201,13 +243,10 @@ impl BlockStore { } // Sort metadata by number, reversed - let mut metadata = response + let metadata = response .into_iter() - .sorted_by_key(|f| f.block_number) + .sorted_by(|a, b| b.block_number.cmp(&a.block_number)) .collect_vec(); - metadata.reverse(); - // mark the block - metadata.last_mut().unwrap().parent_hash = metadata.first().unwrap().block_hash; // Store the metadata for meta in metadata { @@ -218,31 +257,18 @@ impl BlockStore { } // If the last block does not exist in our canonical history, fire the next request - if self.last_metadata.is_some() - && self - .db - .get_block_by_hash(&self.last_metadata.as_ref().unwrap().block_hash)? - .is_none() - { - self.request_missing_chain(None)?; - } else { - // Hit our internal history. Begin replicating chain. - self.request_missing_blocks()?; + if let Some(meta) = self.last_metadata.as_ref() { + if self.db.get_block_by_hash(&meta.block_hash)?.is_none() { + self.request_missing_chain(None)?; + } else { + // Hit our internal history. Begin replicating chain. + self.request_missing_blocks()?; + } } Ok(()) } - fn request_missing_blocks(&mut self) -> Result<()> { - // ... - tracing::info!( - "blockstore::RequestMissingBlocks : requesting missing blocks {:?}", - self.last_metadata - ); - - Ok(()) - } - /// Returns the metadata of the chain from a given hash. /// /// This constructs a historical chain going backwards from a hash, by following the parent_hash. @@ -283,7 +309,7 @@ impl BlockStore { /// Request blocks from a hash, backwards. /// - /// It will collect N blocks by following the block.parent_hash() of the requested block. + /// It will collect N blocks by following the block.parent_hash() of each requested block. 
pub fn handle_request_from_hash( &mut self, from: PeerId, @@ -567,15 +593,18 @@ impl BlockStore { } let message = if let Some(omega_block) = omega_block { + self.landmark_metadata.push(omega_block.hash()); ExternalMessage::MetaDataRequest(RequestBlock { from_number: omega_block.number(), from_hash: omega_block.hash(), batch_size: self.max_blocks_in_flight, }) } else { + let hash = self.last_metadata.as_ref().unwrap().parent_hash; + self.landmark_metadata.push(hash); ExternalMessage::MetaDataRequest(RequestBlock { from_number: self.last_metadata.as_ref().unwrap().block_number, - from_hash: self.last_metadata.as_ref().unwrap().block_hash, + from_hash: hash, batch_size: self.max_blocks_in_flight, }) }; From abfa6d474df153196e6c6bc98073ac3b2442f514 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 3 Jan 2025 10:13:31 +0800 Subject: [PATCH 033/119] feat: added handle_multiblock_request/response(). --- zilliqa/src/blockstore.rs | 133 +++++++++++++++++++++++++++++++------- zilliqa/src/node.rs | 12 ++++ 2 files changed, 121 insertions(+), 24 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 319d6b5e9..7fbd8a0c1 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -176,10 +176,85 @@ impl BlockStore { } } + /// Handle a multi-block response. + /// + /// This is the final step in the syncing algorithm, where we receive a set of blocks and inject them into + /// the pipeline. We also remove the blocks from the chain metadata, because they are now in the pipeline. + pub fn handle_multiblock_response( + &mut self, + from: PeerId, + response: Vec, + ) -> Result<()> { + // ... + tracing::info!( + "blockstore::MultiBlockResponse : received {} blocks from {}", + response.len(), + from + ); + + // Process whatever we received + if response.is_empty() { + // Empty response, downgrade peer + tracing::warn!("blockstore::MultiBlockResponse : empty blocks {from}",); + self.done_with_peer(DownGrade::Empty); + } else if response.len() < self.max_blocks_in_flight { + // Partial response, downgrade peer + tracing::warn!("blockstore::MultiBlockResponse : partial blocks {from}",); + self.done_with_peer(DownGrade::Partial); + } else { + self.done_with_peer(DownGrade::None); + } + + let proposals = response + .into_iter() + .sorted_by_key(|p| p.number()) + .collect_vec(); + + // Remove the blocks from the chain metadata, if they exist + for p in &proposals { + self.chain_metadata.remove(&p.hash()); + } + + self.inject_proposals(proposals)?; + + // Request for next bunch + if !self.landmark_metadata.is_empty() { + self.request_missing_blocks()?; + } + + Ok(()) + } + + pub fn handle_multiblock_request( + &mut self, + from: PeerId, + request: Vec, + ) -> Result { + // ... + tracing::info!( + "blockstore::MultiBlockRequest : received a {} multiblock request from {}", + request.len(), + from + ); + + let batch_size: usize = self.max_batch_size.min(request.len()); // mitigate DOS by limiting the number of blocks we return + let mut proposals = Vec::with_capacity(batch_size); + for hash in request { + let Some(block) = self.db.get_block_by_hash(&hash)? else { + break; // that's all we have! + }; + proposals.push(self.block_to_proposal(block)); + } + + let message = ExternalMessage::MultiBlockResponse(proposals); + Ok(message) + } + /// Request missing blocks from the chain. /// /// It constructs a set of hashes, which constitute the series of blocks that are missing. /// These hashes are then sent to a Peer for retrieval. + /// This is Part 2 of the syncing algorithm. 
fn request_missing_blocks(&mut self) -> Result<()> { // ... tracing::info!("blockstore::RequestMissingBlocks : requesting missing blocks"); @@ -187,7 +262,6 @@ impl BlockStore { // If we have no landmarks, we have nothing to do if let Some(mut hash) = self.landmark_metadata.pop() { let mut request_hashes = Vec::with_capacity(self.max_batch_size); - request_hashes.push(hash); while let Some(meta) = self.chain_metadata.remove(&hash) { request_hashes.push(meta.block_hash); hash = meta.parent_hash; @@ -197,9 +271,9 @@ impl BlockStore { // Fire request if let Some(peer) = self.get_next_peer() { tracing::debug!( - "blockstore::RequestMissingBlocks : requesting {} blocks from {}", + "blockstore::RequestMissingBlocks : requesting {} blocks of {}", request_hashes.len(), - peer.peer_id + self.landmark_metadata.len(), ); self.message_sender.send_external_message( peer.peer_id, @@ -213,7 +287,7 @@ impl BlockStore { } /// Handle a response to a metadata request. - /// + /// /// This is the first step in the syncing algorithm, where we receive a set of metadata and use it to /// construct a chain history. We then request the missing blocks from the chain. pub fn handle_metadata_response( @@ -221,7 +295,6 @@ impl BlockStore { from: PeerId, response: Vec, ) -> Result<()> { - // ... tracing::info!( "blockstore::MetadataResponse : received {} metadata from {}", response.len(), @@ -234,7 +307,7 @@ impl BlockStore { tracing::warn!("blockstore::MetadataResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); - } else if response.len() < self.max_blocks_in_flight { + } else if response.len() < self.max_batch_size { // Partial response, downgrade peer tracing::warn!("blockstore::MetadataResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); @@ -248,17 +321,18 @@ impl BlockStore { .sorted_by(|a, b| b.block_number.cmp(&a.block_number)) .collect_vec(); + self.last_metadata = Some(metadata.last().unwrap().clone()); + // Store the metadata for meta in metadata { // TODO: Check the linkage of the returned chain - if let Some(meta) = self.chain_metadata.insert(meta.block_hash, meta) { - self.last_metadata = Some(meta); - } + self.chain_metadata.insert(meta.block_hash, meta); } // If the last block does not exist in our canonical history, fire the next request if let Some(meta) = self.last_metadata.as_ref() { if self.db.get_block_by_hash(&meta.block_hash)?.is_none() { + // TODO: store the peer that provided this metadata self.request_missing_chain(None)?; } else { // Hit our internal history. Begin replicating chain. @@ -287,7 +361,7 @@ impl BlockStore { // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. - let batch_size = self.max_batch_size.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return + let batch_size: usize = self.max_batch_size.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return let mut metas = Vec::with_capacity(batch_size); let mut hash = request.from_hash; while metas.len() < batch_size { @@ -386,11 +460,6 @@ impl BlockStore { /// last known Proposal in the pipeline. This is used for speculative fetches, and also for /// knowing where to continue fetching from. 
fn inject_proposals(&mut self, proposals: Vec) -> Result<()> { - tracing::info!( - "blockstore::InjectProposals : injecting {} proposals", - proposals.len() - ); - if proposals.is_empty() { return Ok(()); } @@ -401,6 +470,7 @@ impl BlockStore { // Increment proposals injected self.injected = self.injected.saturating_add(proposals.len()); + let len = proposals.len(); // Just pump the Proposals back to ourselves. for p in proposals { @@ -418,6 +488,12 @@ impl BlockStore { }), )?; } + + tracing::info!( + "blockstore::InjectProposals : injected {}/{} proposals", + len, + self.injected + ); // return last proposal Ok(()) } @@ -576,7 +652,7 @@ impl BlockStore { if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { tracing::warn!( - "blockstore::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", + "blockstore::RequestMissingChain : in-flight request {} timed out, requesting from new peer", peer.peer_id ); self.done_with_peer(DownGrade::Timeout); @@ -585,33 +661,42 @@ impl BlockStore { return Ok(()); } } else { + if self.injected > 0 { + tracing::warn!( + "blockstore::RequestMissingChain : too many {} blocks in flight", + self.injected + ); + return Ok(()); + } self.in_flight = self.get_next_peer(); if self.in_flight.is_none() { - tracing::warn!("blockstore::RequestMissingBlocks : insufficient peers to request missing blocks"); + tracing::warn!("blockstore::RequestMissingChain : insufficient peers to request missing blocks"); return Ok(()); } } let message = if let Some(omega_block) = omega_block { - self.landmark_metadata.push(omega_block.hash()); + let num = omega_block.number(); + let hash = omega_block.hash(); + self.landmark_metadata.push(hash); ExternalMessage::MetaDataRequest(RequestBlock { - from_number: omega_block.number(), - from_hash: omega_block.hash(), - batch_size: self.max_blocks_in_flight, + from_number: num, + from_hash: hash, + batch_size: self.max_batch_size, }) } else { let hash = self.last_metadata.as_ref().unwrap().parent_hash; self.landmark_metadata.push(hash); ExternalMessage::MetaDataRequest(RequestBlock { - from_number: self.last_metadata.as_ref().unwrap().block_number, + from_number: 0, from_hash: hash, - batch_size: self.max_blocks_in_flight, + batch_size: self.max_batch_size, }) }; let peer = self.in_flight.as_ref().unwrap(); tracing::info!( ?message, - "blockstore::RequestMissingBlocks : requesting missing chain from {}", + "blockstore::RequestMissingChain : requesting missing chain from {}", peer.peer_id ); self.message_sender diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 440656849..816d9a5f4 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -273,6 +273,13 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } + ExternalMessage::MultiBlockRequest(request) => { + let message = self + .consensus + .blockstore + .handle_multiblock_request(from, request)?; + self.request_responses.send((response_channel, message))?; + } ExternalMessage::MetaDataRequest(request) => { let message = self .consensus @@ -390,6 +397,11 @@ impl Node { pub fn handle_response(&mut self, from: PeerId, message: ExternalMessage) -> Result<()> { debug!(%from, to = %self.peer_id, %message, "handling response"); match message { + ExternalMessage::MultiBlockResponse(response) => { + self.consensus + .blockstore + .handle_multiblock_response(from, response)?; + } ExternalMessage::MetaDataResponse(response) => { self.consensus .blockstore From 
936b0a426d69538df3417b8d74a33fa0c93647af Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 3 Jan 2025 13:22:14 +0800 Subject: [PATCH 034/119] feat: [checkpoint - multi_block_request/response; never quite catching up.] --- zilliqa/src/blockstore.rs | 252 +++++++++++++++++++++++--------------- 1 file changed, 156 insertions(+), 96 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index 7fbd8a0c1..b4597963e 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -46,6 +46,7 @@ enum DownGrade { // TODO: How to handle case where only single source of truth i.e. bootstrap node? const GAP_THRESHOLD: usize = 5; // How big is big/small gap. +const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks, allowing it to catch up. #[derive(Debug)] pub struct BlockStore { @@ -73,8 +74,9 @@ pub struct BlockStore { // Chain metadata chain_metadata: BTreeMap, - last_metadata: Option, - landmark_metadata: Vec, + p1_metadata: Option, + p2_metadata: Option, + landmarks: Vec, } impl BlockStore { @@ -107,8 +109,9 @@ impl BlockStore { cache: HashMap::new(), latest_block: None, chain_metadata: BTreeMap::new(), - last_metadata: None, - landmark_metadata: Vec::new(), + p1_metadata: None, + landmarks: Vec::new(), + p2_metadata: None, }) } @@ -145,7 +148,18 @@ impl BlockStore { "blockstore::ProcessProposal : Parent block {} not found", block.parent_hash() ); - self.request_missing_chain(Some(block))?; + if self.p2_metadata.is_some() { + // Continue phase 2 + self.request_missing_blocks()?; + } else { + if self.p1_metadata.is_none() { + // Start phase 1 + self.request_missing_chain(Some(block))?; + } else { + // Continue phase 1 + self.request_missing_chain(None)?; + } + } return Ok(()); } Ok(()) @@ -197,7 +211,7 @@ impl BlockStore { // Empty response, downgrade peer tracing::warn!("blockstore::MultiBlockResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); - } else if response.len() < self.max_blocks_in_flight { + } else if response.len() < self.max_batch_size { // Partial response, downgrade peer tracing::warn!("blockstore::MultiBlockResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); @@ -210,6 +224,19 @@ impl BlockStore { .sorted_by_key(|p| p.number()) .collect_vec(); + if let Some(landmark) = self.landmarks.pop() { + // remove the last landmark, should match proposals.last() + let hash = proposals.last().as_ref().unwrap().hash(); + if hash != landmark { + tracing::warn!( + "blockstore::MultiBlockResponse : mismatched landmark {} != {}", + landmark, + hash, + ); + self.landmarks.push(landmark); // put it back + } + } + // Remove the blocks from the chain metadata, if they exist for p in &proposals { self.chain_metadata.remove(&p.hash()); @@ -217,8 +244,12 @@ impl BlockStore { self.inject_proposals(proposals)?; - // Request for next bunch - if !self.landmark_metadata.is_empty() { + // Done with phase 2, allow phase 1 to restart. + if self.landmarks.is_empty() { + self.p1_metadata = None; + self.chain_metadata.clear(); + } else if DO_SPECULATIVE { + // Speculatively request more blocks self.request_missing_blocks()?; } @@ -256,33 +287,60 @@ impl BlockStore { /// These hashes are then sent to a Peer for retrieval. /// This is Part 2 of the syncing algorithm. fn request_missing_blocks(&mut self) -> Result<()> { - // ... 
- tracing::info!("blockstore::RequestMissingBlocks : requesting missing blocks"); - - // If we have no landmarks, we have nothing to do - if let Some(mut hash) = self.landmark_metadata.pop() { - let mut request_hashes = Vec::with_capacity(self.max_batch_size); - while let Some(meta) = self.chain_metadata.remove(&hash) { - request_hashes.push(meta.block_hash); - hash = meta.parent_hash; - // re-insert the metadata so as not to lose it - // self.chain_metadata.insert(hash, meta); + // Early exit if there's a request in-flight; and if it has not expired. + if let Some(peer) = self.in_flight.as_ref() { + if peer.last_used.elapsed() > self.request_timeout { + tracing::warn!( + "blockstore::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", + peer.peer_id + ); + self.done_with_peer(DownGrade::Timeout); + } else { + return Ok(()); } - // Fire request - if let Some(peer) = self.get_next_peer() { + } else { + if self.p2_metadata.is_none() { + tracing::warn!( + "blockstore::RequestMissingBlocks : no metadata to request missing blocks" + ); + return Ok(()); + } + } + + if let Some(peer) = self.get_next_peer() { + // If we have no landmarks, we have nothing to do + self.p2_metadata = None; + if let Some(mut hash) = self.landmarks.pop() { + self.landmarks.push(hash); // we actually need to peek() at the last element + let mut request_hashes = Vec::with_capacity(self.max_batch_size); + while let Some(meta) = self.chain_metadata.remove(&hash) { + request_hashes.push(meta.block_hash); + hash = meta.parent_hash; + // re-insert the metadata so as not to lose it + // self.chain_metadata.insert(hash, meta); + self.p2_metadata = Some(meta); + } + + // Fire request tracing::debug!( - "blockstore::RequestMissingBlocks : requesting {} blocks of {}", + "blockstore::RequestMissingBlocks : requesting {} blocks of set #{}", request_hashes.len(), - self.landmark_metadata.len(), + self.landmarks.len(), ); self.message_sender.send_external_message( peer.peer_id, ExternalMessage::MultiBlockRequest(request_hashes), )?; self.in_flight = Some(peer); + } else { + // No more landmarks, we're done + self.peers.push(peer); } + } else { + tracing::warn!( + "blockstore::RequestMissingBlocks : insufficient peers to request missing blocks" + ); } - Ok(()) } @@ -321,7 +379,12 @@ impl BlockStore { .sorted_by(|a, b| b.block_number.cmp(&a.block_number)) .collect_vec(); - self.last_metadata = Some(metadata.last().unwrap().clone()); + let p1_metadata = metadata.last().unwrap().clone(); + let last_hash = p1_metadata.block_hash; + self.p1_metadata = Some(p1_metadata); + + self.landmarks + .push(metadata.first().as_ref().unwrap().block_hash); // Store the metadata for meta in metadata { @@ -330,14 +393,11 @@ impl BlockStore { } // If the last block does not exist in our canonical history, fire the next request - if let Some(meta) = self.last_metadata.as_ref() { - if self.db.get_block_by_hash(&meta.block_hash)?.is_none() { - // TODO: store the peer that provided this metadata - self.request_missing_chain(None)?; - } else { - // Hit our internal history. Begin replicating chain. - self.request_missing_blocks()?; - } + if self.db.get_block_by_hash(&last_hash)?.is_some() { + // Hit our internal history. Start phase 2. + self.p2_metadata = self.p1_metadata.clone(); + } else if DO_SPECULATIVE { + self.request_missing_chain(None)?; } Ok(()) @@ -381,6 +441,69 @@ impl BlockStore { Ok(message) } + /// Request missing chain from a peer. 
+ /// + /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. + /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. + /// Otherwise, it requests blocks from the given omega_block. + pub fn request_missing_chain(&mut self, omega_block: Option) -> Result<()> { + // Early exit if there's a request in-flight; and if it has not expired. + if let Some(peer) = self.in_flight.as_ref() { + if peer.last_used.elapsed() > self.request_timeout { + tracing::warn!( + "blockstore::RequestMissingChain : in-flight request {} timed out, requesting from new peer", + peer.peer_id + ); + self.done_with_peer(DownGrade::Timeout); + } else { + return Ok(()); + } + } else { + if self.injected > 0 { + tracing::warn!( + "blockstore::RequestMissingChain : too many {} blocks in flight", + self.injected + ); + return Ok(()); + } + } + + if let Some(peer) = self.get_next_peer() { + let message = if let Some(meta) = self.p1_metadata.as_ref() { + ExternalMessage::MetaDataRequest(RequestBlock { + from_number: 0, + from_hash: meta.parent_hash, + batch_size: self.max_batch_size, + }) + } else if let Some(omega_block) = omega_block { + let num = omega_block.number(); + let hash = omega_block.hash(); + ExternalMessage::MetaDataRequest(RequestBlock { + from_number: num, + from_hash: hash, + batch_size: self.max_batch_size, + }) + } else { + todo!("blockstore::RequestMissingChain : no metadata to request missing blocks"); + }; + + tracing::info!( + ?message, + "blockstore::RequestMissingChain : requesting missing chain from {}", + peer.peer_id + ); + self.message_sender + .send_external_message(peer.peer_id, message)?; + + self.in_flight = Some(peer); + } else { + tracing::warn!( + "blockstore::RequestMissingChain : insufficient peers to request missing blocks" + ); + } + Ok(()) + } + /// Request blocks from a hash, backwards. /// /// It will collect N blocks by following the block.parent_hash() of each requested block. @@ -641,69 +764,6 @@ impl BlockStore { Ok(()) } - /// Request blocks between the current height and the given block. - /// - /// The approach is to request blocks in batches of `max_batch_size` blocks. - /// If None block is provided, we request blocks from the last known canonical block forwards. - /// If the block gap is large, we request blocks from the last known canonical block forwards. - /// If the block gap is small, we request blocks from the latest block backwards. - pub fn request_missing_chain(&mut self, omega_block: Option) -> Result<()> { - // Early exit if there's a request in-flight; and if it has not expired. 
- if let Some(peer) = self.in_flight.as_ref() { - if peer.last_used.elapsed() > self.request_timeout { - tracing::warn!( - "blockstore::RequestMissingChain : in-flight request {} timed out, requesting from new peer", - peer.peer_id - ); - self.done_with_peer(DownGrade::Timeout); - self.in_flight = self.get_next_peer(); - } else { - return Ok(()); - } - } else { - if self.injected > 0 { - tracing::warn!( - "blockstore::RequestMissingChain : too many {} blocks in flight", - self.injected - ); - return Ok(()); - } - self.in_flight = self.get_next_peer(); - if self.in_flight.is_none() { - tracing::warn!("blockstore::RequestMissingChain : insufficient peers to request missing blocks"); - return Ok(()); - } - } - - let message = if let Some(omega_block) = omega_block { - let num = omega_block.number(); - let hash = omega_block.hash(); - self.landmark_metadata.push(hash); - ExternalMessage::MetaDataRequest(RequestBlock { - from_number: num, - from_hash: hash, - batch_size: self.max_batch_size, - }) - } else { - let hash = self.last_metadata.as_ref().unwrap().parent_hash; - self.landmark_metadata.push(hash); - ExternalMessage::MetaDataRequest(RequestBlock { - from_number: 0, - from_hash: hash, - batch_size: self.max_batch_size, - }) - }; - let peer = self.in_flight.as_ref().unwrap(); - tracing::info!( - ?message, - "blockstore::RequestMissingChain : requesting missing chain from {}", - peer.peer_id - ); - self.message_sender - .send_external_message(peer.peer_id, message)?; - Ok(()) - } - /// Add a peer to the list of peers. pub fn add_peer(&mut self, peer: PeerId) { // new peers should be tried last, which gives them time to sync first. From 9ea6e41279ba9b1c6540def96cae47745a1e787e Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 3 Jan 2025 15:52:15 +0800 Subject: [PATCH 035/119] chore: clippy. --- zilliqa/src/blockstore.rs | 83 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index b4597963e..f6bfd46fa 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -5,13 +5,13 @@ use std::{ time::{Duration, Instant}, }; -use crate::crypto::Hash; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; use crate::{ cfg::NodeConfig, + crypto::Hash, db::Db, message::{ Block, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, RequestBlock, @@ -32,18 +32,22 @@ enum DownGrade { // When a Proposal is received by Consensus, we check if the parent exists in our DB. // If not, then it triggers a syncing algorithm. // -// 1. We check if the gap between our last canonical block and the latest Proposal. -// a. If it is a small gap, we request for blocks, going backwards from Proposal. -// b. If it is a big gap, we request for blocks, going forwards from Canonical. -// 2. When we receive a forwards history response, we check for matches against the cache. -// This means that for a proposal to be injected, it must be corroborated by 2 sources. -// a. If it matches the cached value, we inject the proposal into the pipeline. -// b. If it does not match, we replace the cached value and request for more. -// b. If it does not exist in the cache, we cache the proposal. -// 3. When we receive a backwards history response, we inject it into the pipeline. -// a. If it does not line up with the existing Canonical, then it will be dropped. +// Phase 1: Request missing chain metadata. +// The entire chain metadata is stored in-memory, and is used to construct a chain of metadata. +// 1. 
We start with the latest Proposal and request the chain of metadata from a peer. +// 2. We construct the chain of metadata, based on the response received. +// 3. If the last block does not exist in our canonical history, we repeat from 1. +// 4. If the last block exists, we have hit our canonical history, we move to Phase 2. // -// TODO: How to handle case where only single source of truth i.e. bootstrap node? +// Phase 2: Request missing blocks. +// 1. We construct a set of hashes, from the in-memory chain metadata. +// 2. We send these block hashes to a Peer for retrieval. +// 3. We inject the Proposals into the pipeline, when the response is received. +// 4. If there are still missing blocks, we repeat from 1. +// 5. If there are no more missing blocks, we are done. +// +// Subsequent missing Proposals are treated as a new sync algorithm. +// Eventually, we get up to 99.9% of the chain. const GAP_THRESHOLD: usize = 5; // How big is big/small gap. const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks, allowing it to catch up. @@ -151,14 +155,12 @@ impl BlockStore { if self.p2_metadata.is_some() { // Continue phase 2 self.request_missing_blocks()?; + } else if self.p1_metadata.is_none() { + // Start phase 1 + self.request_missing_chain(Some(block))?; } else { - if self.p1_metadata.is_none() { - // Start phase 1 - self.request_missing_chain(Some(block))?; - } else { - // Continue phase 1 - self.request_missing_chain(None)?; - } + // Continue phase 1 + self.request_missing_chain(None)?; } return Ok(()); } @@ -298,20 +300,18 @@ impl BlockStore { } else { return Ok(()); } - } else { - if self.p2_metadata.is_none() { - tracing::warn!( - "blockstore::RequestMissingBlocks : no metadata to request missing blocks" - ); - return Ok(()); - } + } else if self.p2_metadata.is_none() { + tracing::warn!( + "blockstore::RequestMissingBlocks : no metadata to request missing blocks" + ); + return Ok(()); } if let Some(peer) = self.get_next_peer() { // If we have no landmarks, we have nothing to do self.p2_metadata = None; - if let Some(mut hash) = self.landmarks.pop() { - self.landmarks.push(hash); // we actually need to peek() at the last element + if let Some(hash) = self.landmarks.last() { + let mut hash = *hash; // peek at the last value let mut request_hashes = Vec::with_capacity(self.max_batch_size); while let Some(meta) = self.chain_metadata.remove(&hash) { request_hashes.push(meta.block_hash); @@ -353,12 +353,6 @@ impl BlockStore { from: PeerId, response: Vec, ) -> Result<()> { - tracing::info!( - "blockstore::MetadataResponse : received {} metadata from {}", - response.len(), - from - ); - // Process whatever we have received. 
if response.is_empty() { // Empty response, downgrade peer @@ -386,6 +380,13 @@ impl BlockStore { self.landmarks .push(metadata.first().as_ref().unwrap().block_hash); + tracing::info!( + "blockstore::MetadataResponse : received {} metadata set #{} from {}", + metadata.len(), + self.landmarks.len(), + from + ); + // Store the metadata for meta in metadata { // TODO: Check the linkage of the returned chain @@ -458,14 +459,12 @@ impl BlockStore { } else { return Ok(()); } - } else { - if self.injected > 0 { - tracing::warn!( - "blockstore::RequestMissingChain : too many {} blocks in flight", - self.injected - ); - return Ok(()); - } + } else if self.injected > 0 { + tracing::warn!( + "blockstore::RequestMissingChain : too many {} blocks in flight", + self.injected + ); + return Ok(()); } if let Some(peer) = self.get_next_peer() { From c4c89febd6abcfdcfbbd85997b9d4a18f149c1b2 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 4 Jan 2025 11:53:16 +0800 Subject: [PATCH 036/119] feat: sync phase#3 - zip it up. works for syncing new nodes. --- zilliqa/src/blockstore.rs | 111 ++++++++++++++++++++++++-------------- zilliqa/src/consensus.rs | 1 - zilliqa/src/node.rs | 8 +-- 3 files changed, 76 insertions(+), 44 deletions(-) diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/blockstore.rs index f6bfd46fa..a6de12d5c 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/blockstore.rs @@ -1,6 +1,6 @@ use std::{ cmp::Ordering, - collections::{BTreeMap, BinaryHeap, HashMap}, + collections::{BTreeMap, BinaryHeap, HashMap, VecDeque}, sync::Arc, time::{Duration, Instant}, }; @@ -40,16 +40,21 @@ enum DownGrade { // 4. If the last block exists, we have hit our canonical history, we move to Phase 2. // // Phase 2: Request missing blocks. +// Once the chain metadata is constructed, we request the missing blocks to replay the history. // 1. We construct a set of hashes, from the in-memory chain metadata. // 2. We send these block hashes to a Peer for retrieval. // 3. We inject the Proposals into the pipeline, when the response is received. // 4. If there are still missing blocks, we repeat from 1. -// 5. If there are no more missing blocks, we are done. +// 5. If there are no more missing blocks, we are done, ready for Phase 3. // -// Subsequent missing Proposals are treated as a new sync algorithm. -// Eventually, we get up to 99.9% of the chain. - -const GAP_THRESHOLD: usize = 5; // How big is big/small gap. +// Phase 3: Zip it up. +// Phase 1 & 2 brings up to 99% of the chain. This step closes the last gap. +// 1. We queue all newly received Proposals, while Phase 1 & 2 were in progress. +// 2. We check the head of the queue if it's parent exists in our canonical history. +// 3. If it does not, we trigger Phase 1. +// 4. If it does, we inject the entire queue into the pipeline. We are done. + +const GAP_THRESHOLD: usize = 10; // How big is big/small gap. const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks, allowing it to catch up. #[derive(Debug)] @@ -81,6 +86,7 @@ pub struct BlockStore { p1_metadata: Option, p2_metadata: Option, landmarks: Vec, + zip_queue: VecDeque, } impl BlockStore { @@ -116,6 +122,7 @@ impl BlockStore { p1_metadata: None, landmarks: Vec::new(), p2_metadata: None, + zip_queue: VecDeque::with_capacity(GAP_THRESHOLD), }) } @@ -139,30 +146,55 @@ impl BlockStore { Ok(()) } - /// Process a block proposal. - /// Checks if the parent block exists, and if not, triggers a sync. - pub fn process_proposal(&mut self, block: Block) -> Result<()> { - // ... 
- // check if block parent exists - let parent_block = self.db.get_block_by_hash(&block.parent_hash())?; - - // no parent block, trigger sync - if parent_block.is_none() { - tracing::warn!( - "blockstore::ProcessProposal : Parent block {} not found", - block.parent_hash() - ); - if self.p2_metadata.is_some() { - // Continue phase 2 - self.request_missing_blocks()?; - } else if self.p1_metadata.is_none() { - // Start phase 1 - self.request_missing_chain(Some(block))?; + /// Sync a block proposal. + /// + /// This is the main entry point for syncing a block proposal. + /// We start by enqueuing all proposals, and then check if the parent block exists in history. + /// If the parent block exists, we do nothing. Ttherwise, we check the oldest one in the queue. + /// If we find its parent in history, we inject the entire queue. + /// + /// We do not perform checks on the Proposal here. This is done in the consensus layer. + pub fn sync_proposal(&mut self, proposal: Proposal) -> Result<()> { + // just stuff the latest proposal into the fixed-size queue. + while self.zip_queue.len() >= GAP_THRESHOLD { + self.zip_queue.pop_front(); + } + self.zip_queue.push_back(proposal); + + // TODO: Replace with single SQL query + // Check if block parent exist in history + let parent_hash = self.zip_queue.back().unwrap().header.qc.block_hash; + if self.db.get_block_by_hash(&parent_hash)?.is_none() { + // Check if oldes block exists in the history. If it does, we have synced up 99% of the chain. + let ancestor_hash = self.zip_queue.front().unwrap().header.qc.block_hash; + if self.zip_queue.len() == 1 || self.db.get_block_by_hash(&ancestor_hash)?.is_none() { + // No ancestor block, trigger sync + tracing::warn!( + "blockstore::SyncProposal : parent block {} not found", + parent_hash + ); + if self.p2_metadata.is_some() { + // Continue phase 2 + self.request_missing_blocks()?; + } else if self.p1_metadata.is_some() { + // Continue phase 1 + self.request_missing_chain(None)?; + } else { + // Start phase 1 + self.request_missing_chain(Some(parent_hash))?; + } } else { - // Continue phase 1 - self.request_missing_chain(None)?; + // 99% synced, zip it up! + tracing::info!( + "blockstore::SyncProposal : zip up {} blocks from {}", + self.zip_queue.len(), + ancestor_hash + ); + // parent block exists, inject the proposal + let proposals = self.zip_queue.drain(..).collect_vec(); + self.inject_proposals(proposals)?; + // we're done } - return Ok(()); } Ok(()) } @@ -201,13 +233,6 @@ impl BlockStore { from: PeerId, response: Vec, ) -> Result<()> { - // ... - tracing::info!( - "blockstore::MultiBlockResponse : received {} blocks from {}", - response.len(), - from - ); - // Process whatever we received if response.is_empty() { // Empty response, downgrade peer @@ -221,11 +246,19 @@ impl BlockStore { self.done_with_peer(DownGrade::None); } + // Sort proposals by number, ascending let proposals = response .into_iter() .sorted_by_key(|p| p.number()) .collect_vec(); + tracing::info!( + "blockstore::MultiBlockResponse : received {} blocks for set #{} from {}", + proposals.len(), + self.landmarks.len(), + from + ); + if let Some(landmark) = self.landmarks.pop() { // remove the last landmark, should match proposals.last() let hash = proposals.last().as_ref().unwrap().hash(); @@ -447,7 +480,7 @@ impl BlockStore { /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. 
/// Otherwise, it requests blocks from the given omega_block. - pub fn request_missing_chain(&mut self, omega_block: Option) -> Result<()> { + pub fn request_missing_chain(&mut self, parent_hash: Option) -> Result<()> { // Early exit if there's a request in-flight; and if it has not expired. if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { @@ -474,11 +507,9 @@ impl BlockStore { from_hash: meta.parent_hash, batch_size: self.max_batch_size, }) - } else if let Some(omega_block) = omega_block { - let num = omega_block.number(); - let hash = omega_block.hash(); + } else if let Some(hash) = parent_hash { ExternalMessage::MetaDataRequest(RequestBlock { - from_number: num, + from_number: 0, from_hash: hash, batch_size: self.max_batch_size, }) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index e1f5db700..3ffc64204 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -651,7 +651,6 @@ impl Consensus { ); // FIXME: Cleanup - self.blockstore.process_proposal(block.clone())?; if self.block_store.contains_block(&block.hash())? { trace!("ignoring block proposal, block store contains this block already"); diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 816d9a5f4..6ba0dc917 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -376,8 +376,8 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } - _ => { - warn!("unexpected message type"); + msg => { + warn!(%msg, "unexpected message type"); } } @@ -968,7 +968,7 @@ impl Node { } fn handle_proposal(&mut self, from: PeerId, proposal: Proposal) -> Result<()> { - if let Some((to, message)) = self.consensus.proposal(from, proposal, false)? { + if let Some((to, message)) = self.consensus.proposal(from, proposal.clone(), false)? { self.reset_timeout .send(self.config.consensus.consensus_timeout)?; if let Some(to) = to { @@ -976,6 +976,8 @@ impl Node { } else { self.message_sender.broadcast_proposal(message)?; } + } else { + self.consensus.blockstore.sync_proposal(proposal)?; // proposal is already verified } Ok(()) From 39d2cd43fd59d4fefb81b9536a9bf0d3e6ebac31 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 08:56:30 +0800 Subject: [PATCH 037/119] feat: rename blockstore.rs to sync.rs - makes clear that its job is to sync. 
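Patch 036 above gates the zip-up phase on a fixed-size queue of the most recent proposals: each new proposal is pushed at the back and the oldest entries are evicted once GAP_THRESHOLD is reached. A minimal, self-contained sketch of that bounded-queue behaviour using std's VecDeque; plain block numbers stand in for full Proposal structs and push_bounded is an illustrative helper, not a function from the patch.

    use std::collections::VecDeque;

    const GAP_THRESHOLD: usize = 10; // bound mirroring GAP_THRESHOLD in the patch

    /// Push a new item, evicting the oldest entries so the queue never
    /// holds more than GAP_THRESHOLD elements.
    fn push_bounded(queue: &mut VecDeque<u64>, item: u64) {
        while queue.len() >= GAP_THRESHOLD {
            queue.pop_front();
        }
        queue.push_back(item);
    }

    fn main() {
        let mut recent = VecDeque::with_capacity(GAP_THRESHOLD);
        for block_number in 0..25u64 {
            push_bounded(&mut recent, block_number);
        }
        // Only the 10 most recent block numbers remain: 15..=24.
        assert_eq!(recent.front(), Some(&15));
        assert_eq!(recent.back(), Some(&24));
        assert_eq!(recent.len(), GAP_THRESHOLD);
    }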
--- zilliqa/src/consensus.rs | 8 +- zilliqa/src/lib.rs | 2 +- zilliqa/src/node.rs | 125 ++++--------------------- zilliqa/src/{blockstore.rs => sync.rs} | 4 +- 4 files changed, 25 insertions(+), 114 deletions(-) rename zilliqa/src/{blockstore.rs => sync.rs} (99%) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 3ffc64204..cca2c722e 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -22,7 +22,6 @@ use tracing::*; use crate::{ block_store::BlockStore, blockhooks, - blockstore::BlockStore as BlockStore2, cfg::{ConsensusConfig, NodeConfig}, constants::TIME_TO_ALLOW_PROPOSAL_BROADCAST, contracts, @@ -39,6 +38,7 @@ use crate::{ pool::{TransactionPool, TxAddResult, TxPoolContent}, range_map::RangeMap, state::State, + sync::Sync, time::SystemTime, transaction::{EvmGas, SignedTransaction, TransactionReceipt, VerifiedTransaction}, }; @@ -152,7 +152,7 @@ pub struct Consensus { config: NodeConfig, message_sender: MessageSender, reset_timeout: UnboundedSender, - pub blockstore: BlockStore2, + pub sync: Sync, pub block_store: BlockStore, latest_leader_cache: RefCell>, votes: BTreeMap, @@ -208,7 +208,7 @@ impl Consensus { )?; } - let blockstore = BlockStore2::new(&config, db.clone(), message_sender.clone(), Vec::new())?; + let sync = Sync::new(&config, db.clone(), message_sender.clone(), Vec::new())?; // It is important to create the `BlockStore` after the checkpoint has been loaded into the DB. The // `BlockStore` pre-loads and caches information about the currently stored blocks. @@ -328,7 +328,7 @@ impl Consensus { let mut consensus = Consensus { secret_key, config, - blockstore, + sync, block_store, latest_leader_cache: RefCell::new(None), message_sender, diff --git a/zilliqa/src/lib.rs b/zilliqa/src/lib.rs index 28445f822..bbb360644 100644 --- a/zilliqa/src/lib.rs +++ b/zilliqa/src/lib.rs @@ -1,7 +1,6 @@ pub mod api; pub mod block_store; mod blockhooks; -pub mod blockstore; pub mod cfg; pub mod consensus; pub mod constants; @@ -25,6 +24,7 @@ mod scilla_proto; pub mod serde_util; pub mod state; pub mod test_util; +pub mod sync; pub mod time; pub mod transaction; pub mod zq1_proto; diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 6ba0dc917..b8d2f535f 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -226,10 +226,10 @@ impl Node { } } ExternalMessage::AddPeer => { - self.consensus.blockstore.add_peer(from); + self.consensus.sync.add_peer(from); } ExternalMessage::RemovePeer => { - self.consensus.blockstore.remove_peer(from); + self.consensus.sync.remove_peer(from); } // `Proposals` are re-routed to `handle_request()` _ => { @@ -276,91 +276,14 @@ impl Node { ExternalMessage::MultiBlockRequest(request) => { let message = self .consensus - .blockstore + .sync .handle_multiblock_request(from, request)?; self.request_responses.send((response_channel, message))?; } ExternalMessage::MetaDataRequest(request) => { - let message = self - .consensus - .blockstore - .handle_metadata_request(from, request)?; - self.request_responses.send((response_channel, message))?; - } - ExternalMessage::RequestFromNumber(request) => { - let message = self - .consensus - .blockstore - .handle_request_from_number(from, request)?; + let message = self.consensus.sync.handle_metadata_request(from, request)?; self.request_responses.send((response_channel, message))?; } - ExternalMessage::RequestFromHash(request) => { - let message = self - .consensus - .blockstore - .handle_request_from_hash(from, request)?; - self.request_responses.send((response_channel, message))?; 
- } - // Respond negatively to old BlockRequests. - ExternalMessage::BlockRequest(request) => { - self.request_responses.send(( - response_channel, - ExternalMessage::BlockResponse(BlockResponse { - proposals: vec![], - from_view: request.from_view, - availability: None, - }), - ))?; - return Ok(()); - - // if from == self.peer_id { - // debug!("block_store::BlockRequest : ignoring blocks request to self"); - // return Ok(()); - // } - - // trace!( - // "block_store::BlockRequest : received a block request - {}", - // self.peer_id - // ); - // // Note that it is very important that we limit this by number of blocks - // // returned, _not_ by max view range returned. If we don't, then any - // // view gap larger than block_request_limit will never be filliable - // // because no node will ever be prepared to return the block after it. - // let proposals: Vec = (request.from_view..=request.to_view) - // .take(self.config.block_request_limit) - // .filter_map(|view| { - // self.consensus - // .get_block_by_view(view) - // .transpose() - // .map(|block| Ok(self.block_to_proposal(block?))) - // }) - // .collect::>()?; - - // let availability = self.consensus.block_store.availability()?; - // trace!("block_store::BlockRequest - responding to new blocks request {id:?} from {from:?} of {request:?} with props {0:?} availability {availability:?}", - // proposals.iter().fold("".to_string(), |state, x| format!("{},{}", state, x.header.view))); - - // // Send the response to this block request. - // self.request_responses.send(( - // response_channel, - // ExternalMessage::BlockResponse(BlockResponse { - // proposals, - // from_view: request.from_view, - // availability, - // }), - // ))?; - } - // We don't usually expect a [BlockResponse] to be received as a request, however this can occur when our - // [BlockStore] has re-sent a previously unusable block because we didn't (yet) have the block's parent. - // Having knowledge of this here breaks our abstraction boundaries slightly, but it also keeps things - // simple. - ExternalMessage::BlockResponse(m) => { - self.handle_block_response(from, m)?; - // Acknowledge this block response. This does nothing because the `BlockResponse` request was sent by - // us, but we keep it here for symmetry with the other handlers. - self.request_responses - .send((response_channel, ExternalMessage::Acknowledgement))?; - } // This just breaks down group block messages into individual messages to stop them blocking threads // for long periods. 
ExternalMessage::InjectedProposal(p) => { @@ -397,30 +320,18 @@ impl Node { pub fn handle_response(&mut self, from: PeerId, message: ExternalMessage) -> Result<()> { debug!(%from, to = %self.peer_id, %message, "handling response"); match message { - ExternalMessage::MultiBlockResponse(response) => { - self.consensus - .blockstore - .handle_multiblock_response(from, response)?; - } - ExternalMessage::MetaDataResponse(response) => { - self.consensus - .blockstore - .handle_metadata_response(from, response)?; - } - ExternalMessage::ResponseFromNumber(response) => { - self.consensus - .blockstore - .handle_response_from_number(from, response)?; - } - ExternalMessage::ResponseFromHash(response) => { - self.consensus - .blockstore - .handle_response_from_hash(from, response)?; - } - ExternalMessage::BlockResponse(m) => self.handle_block_response(from, m)?, + ExternalMessage::MultiBlockResponse(response) => self + .consensus + .sync + .handle_multiblock_response(from, response)?, + + ExternalMessage::MetaDataResponse(response) => self + .consensus + .sync + .handle_metadata_response(from, response)?, ExternalMessage::Acknowledgement => {} - _ => { - warn!("unexpected message type"); + msg => { + warn!(%msg, "unexpected message type"); } } @@ -977,13 +888,13 @@ impl Node { self.message_sender.broadcast_proposal(message)?; } } else { - self.consensus.blockstore.sync_proposal(proposal)?; // proposal is already verified + self.consensus.sync.sync_proposal(proposal)?; // proposal is already verified } Ok(()) } - fn handle_block_response(&mut self, from: PeerId, response: BlockResponse) -> Result<()> { + fn _handle_block_response(&mut self, from: PeerId, response: BlockResponse) -> Result<()> { trace!( "block_store::handle_block_response - received blocks response of length {}", response.proposals.len() @@ -1009,7 +920,7 @@ impl Node { return Ok(()); } trace!("Handling proposal for view {0}", req.block.header.view); - self.consensus.blockstore.mark_received_proposal(&req)?; + self.consensus.sync.mark_received_proposal(&req)?; let proposal = self.consensus.receive_block(from, req.block)?; if let Some(proposal) = proposal { trace!( diff --git a/zilliqa/src/blockstore.rs b/zilliqa/src/sync.rs similarity index 99% rename from zilliqa/src/blockstore.rs rename to zilliqa/src/sync.rs index a6de12d5c..5a88e2e8e 100644 --- a/zilliqa/src/blockstore.rs +++ b/zilliqa/src/sync.rs @@ -58,7 +58,7 @@ const GAP_THRESHOLD: usize = 10; // How big is big/small gap. const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks, allowing it to catch up. #[derive(Debug)] -pub struct BlockStore { +pub struct Sync { // database db: Arc, // message bus @@ -89,7 +89,7 @@ pub struct BlockStore { zip_queue: VecDeque, } -impl BlockStore { +impl Sync { pub fn new( config: &NodeConfig, db: Arc, From 6ccc7ca7d592c67c5a67253486019a4b640eb875 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 09:34:44 +0800 Subject: [PATCH 038/119] nit: minor refactor - removing previous strategy. 
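Patches 036-037 above push historical blocks back through the node's own message queue as InjectedProposal messages so that a large batch never blocks a thread: the injected counter goes up when a batch is queued, goes back down as each proposal is marked received, and further block requests are gated on it staying below max_blocks_in_flight. A rough sketch of that back-pressure idea; InFlightGate and its methods are illustrative names, only the injected / max_blocks_in_flight pairing mirrors the fields shown above.

    /// Illustrative back-pressure gate: track how many injected proposals are
    /// still pending and refuse to request more work past a fixed limit.
    struct InFlightGate {
        injected: usize,
        max_blocks_in_flight: usize,
    }

    impl InFlightGate {
        fn new(max_blocks_in_flight: usize) -> Self {
            Self { injected: 0, max_blocks_in_flight }
        }

        /// Called when a batch of proposals is pushed into the pipeline.
        fn on_injected(&mut self, count: usize) {
            self.injected = self.injected.saturating_add(count);
        }

        /// Called when one injected proposal has been processed.
        fn on_received(&mut self) {
            self.injected = self.injected.saturating_sub(1);
        }

        /// Only issue the next request when there is room in the pipeline.
        fn may_request_more(&self) -> bool {
            self.injected < self.max_blocks_in_flight
        }
    }

    fn main() {
        let mut gate = InFlightGate::new(100);
        gate.on_injected(100);
        assert!(!gate.may_request_more());
        for _ in 0..40 {
            gate.on_received();
        }
        assert!(gate.may_request_more()); // 60 pending < 100 allowed
    }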
--- zilliqa/src/sync.rs | 313 ++++++++------------------------------------ 1 file changed, 52 insertions(+), 261 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 5a88e2e8e..decee9fb8 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -1,6 +1,6 @@ use std::{ cmp::Ordering, - collections::{BTreeMap, BinaryHeap, HashMap, VecDeque}, + collections::{BTreeMap, BinaryHeap, VecDeque}, sync::Arc, time::{Duration, Instant}, }; @@ -13,10 +13,7 @@ use crate::{ cfg::NodeConfig, crypto::Hash, db::Db, - message::{ - Block, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, RequestBlock, - ResponseBlock, - }, + message::{Block, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, RequestBlock}, node::MessageSender, }; @@ -32,30 +29,33 @@ enum DownGrade { // When a Proposal is received by Consensus, we check if the parent exists in our DB. // If not, then it triggers a syncing algorithm. // -// Phase 1: Request missing chain metadata. +// PHASE 1: Request missing chain metadata. // The entire chain metadata is stored in-memory, and is used to construct a chain of metadata. // 1. We start with the latest Proposal and request the chain of metadata from a peer. // 2. We construct the chain of metadata, based on the response received. -// 3. If the last block does not exist in our canonical history, we repeat from 1. -// 4. If the last block exists, we have hit our canonical history, we move to Phase 2. +// 3. If the last block does not exist in our canonical history, we request for additional metadata. +// 4. If the last block exists, we have hit our canonical history. +// 5. Move to Phase 2. // -// Phase 2: Request missing blocks. +// PHASE 2: Request missing blocks. // Once the chain metadata is constructed, we request the missing blocks to replay the history. // 1. We construct a set of hashes, from the in-memory chain metadata. -// 2. We send these block hashes to a Peer for retrieval. +// 2. We send these block hashes to the same Peer (that sent the metadata) for retrieval. // 3. We inject the Proposals into the pipeline, when the response is received. -// 4. If there are still missing blocks, we repeat from 1. -// 5. If there are no more missing blocks, we are done, ready for Phase 3. +// 4. If there are still missing blocks, we ask for more, from 1. +// 5. If there are no more missing blocks, we have filled up all blocks from the chain metadata. +// 6. Ready for Phase 3. // -// Phase 3: Zip it up. -// Phase 1 & 2 brings up to 99% of the chain. This step closes the last gap. +// PHASE 3: Zip it up. +// Phase 1&2 may run several times that brings up 99% of the chain. This closes the final gap. // 1. We queue all newly received Proposals, while Phase 1 & 2 were in progress. -// 2. We check the head of the queue if it's parent exists in our canonical history. -// 3. If it does not, we trigger Phase 1. -// 4. If it does, we inject the entire queue into the pipeline. We are done. +// 2. We check the head of the queue if its parent exists in our canonical history. +// 3. If it does not, we trigger Phase 1&2. +// 4. If it does, we inject the entire queue into the pipeline. +// 5. We are caught up. -const GAP_THRESHOLD: usize = 10; // How big is big/small gap. -const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks, allowing it to catch up. +const GAP_THRESHOLD: usize = 20; // Size of internal Proposal cache. +const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks. 
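The three phases described above are not tracked with an explicit state field; which request to fire next is inferred from which cursors are currently populated when a proposal arrives. A compact sketch of that decision logic, with the cursors reduced to booleans; NextAction and next_action are illustrative names only.

    /// Illustrative reduction of the phase selection in sync_proposal():
    /// the next step is derived from which cursors are populated.
    #[derive(Debug, PartialEq)]
    enum NextAction {
        ZipUp,          // ancestor of the queued proposals is known: inject the queue
        ContinuePhase2, // phase-2 cursor set: keep fetching missing blocks
        ContinuePhase1, // phase-1 cursor set: keep extending the metadata chain
        StartPhase1,    // nothing set yet: start from the latest proposal
    }

    fn next_action(
        parent_known: bool,   // parent of the newest proposal is already in the DB
        ancestor_known: bool, // parent of the oldest queued proposal is in the DB
        p1_cursor_set: bool,
        p2_cursor_set: bool,
    ) -> Option<NextAction> {
        if parent_known {
            return None; // nothing to do, we are in sync
        }
        if ancestor_known {
            return Some(NextAction::ZipUp);
        }
        Some(if p2_cursor_set {
            NextAction::ContinuePhase2
        } else if p1_cursor_set {
            NextAction::ContinuePhase1
        } else {
            NextAction::StartPhase1
        })
    }

    fn main() {
        assert_eq!(next_action(true, false, false, false), None);
        assert_eq!(next_action(false, true, false, false), Some(NextAction::ZipUp));
        assert_eq!(next_action(false, false, false, true), Some(NextAction::ContinuePhase2));
        assert_eq!(next_action(false, false, true, false), Some(NextAction::ContinuePhase1));
        assert_eq!(next_action(false, false, false, false), Some(NextAction::StartPhase1));
    }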
#[derive(Debug)] pub struct Sync { @@ -77,9 +77,6 @@ pub struct Sync { peer_id: PeerId, // how many injected proposals injected: usize, - // cache - cache: HashMap, - latest_block: Option, // Chain metadata chain_metadata: BTreeMap, @@ -116,8 +113,6 @@ impl Sync { max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, injected: 0, - cache: HashMap::new(), - latest_block: None, chain_metadata: BTreeMap::new(), p1_metadata: None, landmarks: Vec::new(), @@ -132,14 +127,14 @@ impl Sync { pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { if prop.from != self.peer_id { tracing::error!( - "blockstore::MarkReceivedProposal : foreign InjectedProposal from {}", + "sync::MarkReceivedProposal : foreign InjectedProposal from {}", prop.from ); } - if let Some(p) = self.cache.remove(&prop.block.number()) { + if let Some(p) = self.chain_metadata.remove(&prop.block.hash()) { tracing::warn!( - "blockstore::MarkReceivedProposal : removing stale cache proposal {}", - p.number() + "sync::MarkReceivedProposal : removing stale metadata {}", + p.block_hash ); } self.injected = self.injected.saturating_sub(1); @@ -170,7 +165,7 @@ impl Sync { if self.zip_queue.len() == 1 || self.db.get_block_by_hash(&ancestor_hash)?.is_none() { // No ancestor block, trigger sync tracing::warn!( - "blockstore::SyncProposal : parent block {} not found", + "sync::SyncProposal : parent block {} not found", parent_hash ); if self.p2_metadata.is_some() { @@ -186,7 +181,7 @@ impl Sync { } else { // 99% synced, zip it up! tracing::info!( - "blockstore::SyncProposal : zip up {} blocks from {}", + "sync::SyncProposal : zip up {} blocks from {}", self.zip_queue.len(), ancestor_hash ); @@ -236,11 +231,12 @@ impl Sync { // Process whatever we received if response.is_empty() { // Empty response, downgrade peer - tracing::warn!("blockstore::MultiBlockResponse : empty blocks {from}",); + tracing::warn!("sync::MultiBlockResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); } else if response.len() < self.max_batch_size { // Partial response, downgrade peer - tracing::warn!("blockstore::MultiBlockResponse : partial blocks {from}",); + // TODO: Match against request numbers + tracing::warn!("sync::MultiBlockResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { self.done_with_peer(DownGrade::None); @@ -253,7 +249,7 @@ impl Sync { .collect_vec(); tracing::info!( - "blockstore::MultiBlockResponse : received {} blocks for set #{} from {}", + "sync::MultiBlockResponse : received {} blocks for set #{} from {}", proposals.len(), self.landmarks.len(), from @@ -264,7 +260,7 @@ impl Sync { let hash = proposals.last().as_ref().unwrap().hash(); if hash != landmark { tracing::warn!( - "blockstore::MultiBlockResponse : mismatched landmark {} != {}", + "sync::MultiBlockResponse : mismatched landmark {} != {}", landmark, hash, ); @@ -283,7 +279,7 @@ impl Sync { if self.landmarks.is_empty() { self.p1_metadata = None; self.chain_metadata.clear(); - } else if DO_SPECULATIVE { + } else if DO_SPECULATIVE && self.injected < self.max_blocks_in_flight { // Speculatively request more blocks self.request_missing_blocks()?; } @@ -298,7 +294,7 @@ impl Sync { ) -> Result { // ... 
tracing::info!( - "blockstore::MultiBlockRequest : received a {} multiblock request from {}", + "sync::MultiBlockRequest : received a {} multiblock request from {}", request.len(), from ); @@ -326,7 +322,7 @@ impl Sync { if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { tracing::warn!( - "blockstore::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", + "sync::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", peer.peer_id ); self.done_with_peer(DownGrade::Timeout); @@ -335,11 +331,12 @@ impl Sync { } } else if self.p2_metadata.is_none() { tracing::warn!( - "blockstore::RequestMissingBlocks : no metadata to request missing blocks" + "sync::RequestMissingBlocks : no metadata to request missing blocks" ); return Ok(()); } + // TODO: Use original peer, which would have the set of blocks if let Some(peer) = self.get_next_peer() { // If we have no landmarks, we have nothing to do self.p2_metadata = None; @@ -356,7 +353,7 @@ impl Sync { // Fire request tracing::debug!( - "blockstore::RequestMissingBlocks : requesting {} blocks of set #{}", + "sync::RequestMissingBlocks : requesting {} blocks of set #{}", request_hashes.len(), self.landmarks.len(), ); @@ -371,7 +368,7 @@ impl Sync { } } else { tracing::warn!( - "blockstore::RequestMissingBlocks : insufficient peers to request missing blocks" + "sync::RequestMissingBlocks : insufficient peers to request missing blocks" ); } Ok(()) @@ -389,17 +386,19 @@ impl Sync { // Process whatever we have received. if response.is_empty() { // Empty response, downgrade peer - tracing::warn!("blockstore::MetadataResponse : empty blocks {from}",); + tracing::warn!("sync::MetadataResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); } else if response.len() < self.max_batch_size { // Partial response, downgrade peer - tracing::warn!("blockstore::MetadataResponse : partial blocks {from}",); + tracing::warn!("sync::MetadataResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { self.done_with_peer(DownGrade::None); } + // TODO: Check the linkage of the returned chain + // Sort metadata by number, reversed let metadata = response .into_iter() @@ -410,11 +409,13 @@ impl Sync { let last_hash = p1_metadata.block_hash; self.p1_metadata = Some(p1_metadata); + // TODO: Store peer id. 
+ // TODO: Insert intermediate landmarks self.landmarks .push(metadata.first().as_ref().unwrap().block_hash); tracing::info!( - "blockstore::MetadataResponse : received {} metadata set #{} from {}", + "sync::MetadataResponse : received {} metadata set #{} from {}", metadata.len(), self.landmarks.len(), from @@ -422,7 +423,6 @@ impl Sync { // Store the metadata for meta in metadata { - // TODO: Check the linkage of the returned chain self.chain_metadata.insert(meta.block_hash, meta); } @@ -448,7 +448,7 @@ impl Sync { request: RequestBlock, ) -> Result { tracing::info!( - "blockstore::MetadataRequest : received a metadata request from {}", + "sync::MetadataRequest : received a metadata request from {}", from ); @@ -470,7 +470,7 @@ impl Sync { let message = ExternalMessage::MetaDataResponse(metas); tracing::trace!( ?message, - "blockstore::MetadataFromHash : responding to block request" + "sync::MetadataFromHash : responding to block request" ); Ok(message) } @@ -485,7 +485,7 @@ impl Sync { if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { tracing::warn!( - "blockstore::RequestMissingChain : in-flight request {} timed out, requesting from new peer", + "sync::RequestMissingChain : in-flight request {} timed out, requesting from new peer", peer.peer_id ); self.done_with_peer(DownGrade::Timeout); @@ -494,7 +494,7 @@ impl Sync { } } else if self.injected > 0 { tracing::warn!( - "blockstore::RequestMissingChain : too many {} blocks in flight", + "sync::RequestMissingChain : too many {} blocks in flight", self.injected ); return Ok(()); @@ -514,12 +514,12 @@ impl Sync { batch_size: self.max_batch_size, }) } else { - todo!("blockstore::RequestMissingChain : no metadata to request missing blocks"); + todo!("sync::RequestMissingChain : no metadata to request missing blocks"); }; tracing::info!( ?message, - "blockstore::RequestMissingChain : requesting missing chain from {}", + "sync::RequestMissingChain : requesting missing chain from {}", peer.peer_id ); self.message_sender @@ -528,85 +528,12 @@ impl Sync { self.in_flight = Some(peer); } else { tracing::warn!( - "blockstore::RequestMissingChain : insufficient peers to request missing blocks" + "sync::RequestMissingChain : insufficient peers to request missing blocks" ); } Ok(()) } - /// Request blocks from a hash, backwards. - /// - /// It will collect N blocks by following the block.parent_hash() of each requested block. - pub fn handle_request_from_hash( - &mut self, - from: PeerId, - request: RequestBlock, - ) -> Result { - tracing::debug!( - "blockstore::RequestFromHash : received a block request from {}", - from - ); - - // TODO: Check if we should service this request - // Validators could respond to this request if there is nothing else to do. - - let batch_size = self.max_batch_size.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return - let mut proposals = Vec::with_capacity(batch_size); - let mut hash = request.from_hash; - while proposals.len() < batch_size { - // grab the parent - let Some(block) = self.db.get_block_by_hash(&hash)? else { - // that's all we have! - break; - }; - hash = block.parent_hash(); - proposals.push(self.block_to_proposal(block)); - } - - let message = ExternalMessage::ResponseFromHash(ResponseBlock { proposals }); - tracing::trace!( - ?message, - "blockstore::RequestFromHash : responding to block request from height" - ); - Ok(message) - } - - /// Request for blocks from a height, forwards. 
- pub fn handle_request_from_number( - &mut self, - from: PeerId, - request: RequestBlock, - ) -> Result { - // ... - tracing::debug!( - "blockstore::RequestFromNumber : received a block request from {}", - from - ); - - // TODO: Check if we should service this request. - // Validators shall not respond to this request. - - // TODO: Replace this with a single SQL query - let batch_size = self.max_batch_size.min(request.batch_size); // mitigate DOS attacks by limiting the number of blocks we send - let mut proposals = Vec::with_capacity(batch_size); - for num in request.from_number.saturating_add(1) - ..=request.from_number.saturating_add(batch_size as u64) - { - let Some(block) = self.db.get_canonical_block_by_number(num)? else { - // that's all we have! - break; - }; - proposals.push(self.block_to_proposal(block)); - } - - let message = ExternalMessage::ResponseFromNumber(ResponseBlock { proposals }); - tracing::trace!( - ?message, - "blockstore::RequestFromNumber : responding to block request from height" - ); - Ok(message) - } - /// Inject the proposals into the chain. /// /// Besides pumping the set of Proposals into the processing pipeline, it also records the @@ -617,10 +544,6 @@ impl Sync { return Ok(()); } - // Store the tip - let (last_block, _) = proposals.last().unwrap().clone().into_parts(); - self.latest_block = Some(last_block); - // Increment proposals injected self.injected = self.injected.saturating_add(proposals.len()); let len = proposals.len(); @@ -643,7 +566,7 @@ impl Sync { } tracing::info!( - "blockstore::InjectProposals : injected {}/{} proposals", + "sync::InjectProposals : injected {}/{} proposals", len, self.injected ); @@ -662,138 +585,6 @@ impl Sync { } } - pub fn handle_response_from_number( - &mut self, - from: PeerId, - response: ResponseBlock, - ) -> Result<()> { - // Process whatever we have received. - if response.proposals.is_empty() { - // Empty response, downgrade peer - tracing::warn!("blockstore::ResponseFromNumber : empty blocks {from}",); - self.done_with_peer(DownGrade::Empty); - return Ok(()); - } else if response.proposals.len() < self.max_batch_size { - // Partial response, downgrade peer - tracing::warn!("blockstore::ResponseFromNumber : partial blocks {from}",); - self.done_with_peer(DownGrade::Partial); - } else { - self.done_with_peer(DownGrade::None); - } - - tracing::info!( - "blockstore::ResponseFromNumber : received {} blocks from {}", - response.proposals.len(), - from - ); - - // TODO: Any additional checks we should do here? - - // Sort proposals by number - let proposals = response - .proposals - .into_iter() - .sorted_by_key(|p| p.number()) - .collect_vec(); - - // Insert into the cache. - // If current proposal matches another one in cache, from a different peer, inject the proposal. - // Else, replace the cached Proposal with the new one. 
- let mut corroborated_proposals = Vec::with_capacity(proposals.len()); - let mut props = proposals.into_iter(); - - // Collect corroborated proposals - for p in props.by_ref() { - if let Some(proposal) = self.cache.remove(&p.number()) { - // If the proposal already exists - if proposal.hash() == p.hash() { - // is corroborated proposal - corroborated_proposals.push(proposal); - } else { - // insert the different one and; - self.cache.insert(p.number(), p); - break; // replace the rest in the next loop - } - } else { - self.cache.insert(p.number(), p); - } - } - - // Replace/insert the rest of the proposals in the cache - for p in props { - self.cache.insert(p.number(), p); - } - - // Inject matched proposals - self.inject_proposals(corroborated_proposals)?; - - // Fire speculative request - if self.latest_block.is_some() && self.injected < self.max_blocks_in_flight { - if let Some(peer) = self.get_next_peer() { - // we're far from latest block - let message = RequestBlock { - from_number: self.latest_block.as_ref().unwrap().number(), - from_hash: self.latest_block.as_ref().unwrap().hash(), - batch_size: self.max_batch_size, - }; - tracing::info!( - "blockstore::RequestMissingBlocks : speculative fetch {} blocks at {} from {}", - message.batch_size, - message.from_number, - peer.peer_id, - ); - self.message_sender.send_external_message( - peer.peer_id, - ExternalMessage::RequestFromNumber(message), - )?; - self.in_flight = Some(peer); - } - } - - Ok(()) - } - - pub fn handle_response_from_hash( - &mut self, - from: PeerId, - response: ResponseBlock, - ) -> Result<()> { - // Check that we have enough to complete the process, otherwise ignore - if response.proposals.is_empty() { - // Empty response, downgrade peer, skip - tracing::warn!("blockstore::ResponseFromHash : empty blocks {from}",); - self.done_with_peer(DownGrade::Empty); - return Ok(()); - } else if response.proposals.len() < GAP_THRESHOLD { - // Partial response, downgrade peer - // Skip processing because we want to ensure that we have ALL the needed blocks to sync up. - tracing::warn!("blockstore::ResponseFromHash : partial blocks {from}",); - self.done_with_peer(DownGrade::Partial); - return Ok(()); - } else { - // only process full responses - self.done_with_peer(DownGrade::None); - } - - tracing::info!( - "blockstore::ResponseFromHash : received {} blocks from {}", - response.proposals.len(), - from - ); - - // TODO: Any additional checks we should do here? - // Sort proposals by number - let proposals = response - .proposals - .into_iter() - .sorted_by_key(|p| p.number()) - .collect_vec(); - - // Inject the proposals - self.inject_proposals(proposals)?; - Ok(()) - } - /// Add a peer to the list of peers. pub fn add_peer(&mut self, peer: PeerId) { // new peers should be tried last, which gives them time to sync first. From 1ee96851a9afa429d43b9f57059020fd3b97730c Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 10:17:52 +0800 Subject: [PATCH 039/119] feat: request multi-blocks from original meta-data peer. 
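The change below keeps, for every chain segment gathered in phase 1, the peer that supplied its metadata, so the phase-2 multi-block request is sent back to that same peer, which is the one most likely to still hold those blocks. A small sketch of that bookkeeping; Segments is an illustrative stand-in, with a string in place of the real libp2p PeerId and u64 in place of block hashes.

    /// Illustrative segment bookkeeping: each landmark remembers which peer
    /// supplied that stretch of metadata, so the block request goes back to it.
    struct Segments {
        landmarks: Vec<(u64 /* landmark hash */, String /* source peer */)>,
    }

    impl Segments {
        fn new() -> Self {
            Self { landmarks: Vec::new() }
        }

        /// Phase 1: record a new segment and the peer that served it.
        fn record(&mut self, landmark_hash: u64, peer: &str) {
            self.landmarks.push((landmark_hash, peer.to_string()));
        }

        /// Phase 2: peek at the most recent segment and the peer to ask for its blocks.
        fn next_request(&self) -> Option<(u64, &str)> {
            self.landmarks.last().map(|(hash, peer)| (*hash, peer.as_str()))
        }

        /// Once the blocks arrive and are injected, the segment is done.
        fn finish(&mut self) {
            self.landmarks.pop();
        }
    }

    fn main() {
        let mut segs = Segments::new();
        segs.record(0xAAAA, "peer-1");
        segs.record(0xBBBB, "peer-2"); // most recent segment came from peer-2
        assert_eq!(segs.next_request(), Some((0xBBBB, "peer-2")));
        segs.finish();
        assert_eq!(segs.next_request(), Some((0xAAAA, "peer-1")));
    }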
--- zilliqa/src/sync.rs | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index decee9fb8..7fee18b7f 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -82,7 +82,7 @@ pub struct Sync { chain_metadata: BTreeMap, p1_metadata: Option, p2_metadata: Option, - landmarks: Vec, + landmarks: Vec<(Hash, PeerId)>, zip_queue: VecDeque, } @@ -255,16 +255,16 @@ impl Sync { from ); - if let Some(landmark) = self.landmarks.pop() { + if let Some((hash, peer_id)) = self.landmarks.pop() { // remove the last landmark, should match proposals.last() - let hash = proposals.last().as_ref().unwrap().hash(); - if hash != landmark { + let prop_hash = proposals.last().as_ref().unwrap().hash(); + if hash != prop_hash { tracing::warn!( "sync::MultiBlockResponse : mismatched landmark {} != {}", - landmark, hash, + prop_hash, ); - self.landmarks.push(landmark); // put it back + self.landmarks.push((hash, peer_id)); // put it back } } @@ -329,18 +329,18 @@ impl Sync { } else { return Ok(()); } + } else if self.injected > self.max_blocks_in_flight { + return Ok(()); } else if self.p2_metadata.is_none() { - tracing::warn!( - "sync::RequestMissingBlocks : no metadata to request missing blocks" - ); + tracing::warn!("sync::RequestMissingBlocks : no metadata to request missing blocks"); return Ok(()); } - // TODO: Use original peer, which would have the set of blocks + // Use original peer, which should have the blocks in the metadata if let Some(peer) = self.get_next_peer() { - // If we have no landmarks, we have nothing to do self.p2_metadata = None; - if let Some(hash) = self.landmarks.last() { + // If we have no landmarks, we have nothing to do + if let Some((hash, peer_id)) = self.landmarks.last() { let mut hash = *hash; // peek at the last value let mut request_hashes = Vec::with_capacity(self.max_batch_size); while let Some(meta) = self.chain_metadata.remove(&hash) { @@ -358,10 +358,15 @@ impl Sync { self.landmarks.len(), ); self.message_sender.send_external_message( - peer.peer_id, + *peer_id, ExternalMessage::MultiBlockRequest(request_hashes), )?; - self.in_flight = Some(peer); + self.peers.push(peer); // reinsert peer, as we will be using a faux peer below + self.in_flight = Some(PeerInfo { + peer_id: *peer_id, + last_used: std::time::Instant::now(), + score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers + }); } else { // No more landmarks, we're done self.peers.push(peer); @@ -412,7 +417,7 @@ impl Sync { // TODO: Store peer id. // TODO: Insert intermediate landmarks self.landmarks - .push(metadata.first().as_ref().unwrap().block_hash); + .push((metadata.first().as_ref().unwrap().block_hash, from)); tracing::info!( "sync::MetadataResponse : received {} metadata set #{} from {}", @@ -581,7 +586,10 @@ impl Sync { peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better, to avoid a single source of truth. peer.score = peer.score.max(self.peers.peek().unwrap().score); - self.peers.push(peer); + // Reinsert peers that are good + if peer.score < u32::MAX { + self.peers.push(peer); + } } } From d016c05eaa7605912bbf5ee15ca353d757424060 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 14:55:07 +0800 Subject: [PATCH 040/119] feat: validates the chain metadata as it is retrieved. 
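The validation added below walks each returned metadata segment and checks that every entry is the parent of the one before it: the hash must match the expected parent hash and the block number must decrease by exactly one, otherwise the segment is rejected and the request retried. A standalone sketch of that linkage check; Meta is a simplified stand-in for ChainMetaData with u64 hashes, and segment_links is an illustrative helper.

    /// Simplified stand-in for ChainMetaData: only the linkage-relevant fields.
    struct Meta {
        block_hash: u64,
        parent_hash: u64,
        block_number: u64,
    }

    /// Check that `segment` (newest block first) forms an unbroken parent chain
    /// starting from the block we already know about.
    fn segment_links(expected_hash: u64, expected_number: u64, segment: &[Meta]) -> bool {
        let mut parent_hash = expected_hash;
        let mut parent_num = expected_number;
        for meta in segment {
            if meta.block_hash != parent_hash || parent_num != meta.block_number + 1 {
                return false;
            }
            parent_hash = meta.parent_hash;
            parent_num = meta.block_number;
        }
        true
    }

    fn main() {
        // Blocks 9, 8, 7 in descending order; hashes equal the block numbers here.
        let segment = [
            Meta { block_hash: 9, parent_hash: 8, block_number: 9 },
            Meta { block_hash: 8, parent_hash: 7, block_number: 8 },
            Meta { block_hash: 7, parent_hash: 6, block_number: 7 },
        ];
        // We previously stopped at block 10, whose parent is block 9.
        assert!(segment_links(9, 10, &segment));
        // A wrong starting number (a gap) breaks the chain.
        assert!(!segment_links(9, 11, &segment));
    }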
--- zilliqa/src/sync.rs | 118 +++++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 33 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 7fee18b7f..39413c827 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -15,6 +15,7 @@ use crate::{ db::Db, message::{Block, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, RequestBlock}, node::MessageSender, + time::SystemTime, }; enum DownGrade { @@ -77,12 +78,15 @@ pub struct Sync { peer_id: PeerId, // how many injected proposals injected: usize, - - // Chain metadata + // complete chain metadata chain_metadata: BTreeMap, + // phase 1 cursor p1_metadata: Option, + // phase 2 cursor p2_metadata: Option, + // stack of chain landmarks landmarks: Vec<(Hash, PeerId)>, + // fixed-size queue of latest proposals zip_queue: VecDeque, } @@ -176,7 +180,8 @@ impl Sync { self.request_missing_chain(None)?; } else { // Start phase 1 - self.request_missing_chain(Some(parent_hash))?; + let block_number = self.zip_queue.back().unwrap().number(); + self.request_missing_chain(Some((parent_hash, block_number)))?; } } else { // 99% synced, zip it up! @@ -249,12 +254,13 @@ impl Sync { .collect_vec(); tracing::info!( - "sync::MultiBlockResponse : received {} blocks for set #{} from {}", + "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", proposals.len(), self.landmarks.len(), from ); + // Check that this segment is for the expected landmark if let Some((hash, peer_id)) = self.landmarks.pop() { // remove the last landmark, should match proposals.last() let prop_hash = proposals.last().as_ref().unwrap().hash(); @@ -287,18 +293,24 @@ impl Sync { Ok(()) } + /// Returns a list of Proposals + /// + /// Given a set of block hashes, retrieve the list of proposals from its history. + /// Returns this list of proposals to the requestor. pub fn handle_multiblock_request( &mut self, from: PeerId, request: Vec, ) -> Result { - // ... - tracing::info!( + tracing::debug!( "sync::MultiBlockRequest : received a {} multiblock request from {}", request.len(), from ); + // TODO: Any additional checks + // Validators should not respond to this, unless they are free e.g. stuck in an exponential backoff. 
+ let batch_size: usize = self.max_batch_size.min(request.len()); // mitigate DOS by limiting the number of blocks we return let mut proposals = Vec::with_capacity(batch_size); for hash in request { @@ -336,7 +348,7 @@ impl Sync { return Ok(()); } - // Use original peer, which should have the blocks in the metadata + // will be re-inserted below if let Some(peer) = self.get_next_peer() { self.p2_metadata = None; // If we have no landmarks, we have nothing to do @@ -346,16 +358,17 @@ impl Sync { while let Some(meta) = self.chain_metadata.remove(&hash) { request_hashes.push(meta.block_hash); hash = meta.parent_hash; - // re-insert the metadata so as not to lose it + // TODO: Allow retry of multi-block request // self.chain_metadata.insert(hash, meta); self.p2_metadata = Some(meta); } - // Fire request - tracing::debug!( - "sync::RequestMissingBlocks : requesting {} blocks of set #{}", + // Fire request, to the original peer that sent the segment metadata + tracing::info!( + "sync::RequestMissingBlocks : requesting {} blocks of segment #{} from {}", request_hashes.len(), self.landmarks.len(), + peer_id, ); self.message_sender.send_external_message( *peer_id, @@ -402,37 +415,69 @@ impl Sync { self.done_with_peer(DownGrade::None); } - // TODO: Check the linkage of the returned chain + // Check the linkage of the returned chain + let Some(p1) = self.p1_metadata.as_ref() else { + tracing::error!( + "no way to check chain linkage from {}", + response.first().unwrap().block_hash + ); + return Ok(()); + }; + let mut parent_hash = p1.parent_hash; + let mut parent_num = p1.block_number; + for meta in response.iter() { + // check that the block hash and number is as expected. + if meta.block_hash != Hash::ZERO + && meta.block_hash == parent_hash + && parent_num == meta.block_number + 1 + { + parent_hash = meta.parent_hash; + parent_num = meta.block_number; + } else { + // if something does not match, we will retry the request with the next peer. + // TODO: possibly, discard and rebuild entire chain + tracing::error!( + "sync::MetadataResponse : retry metadata history for {}", + parent_hash + ); + return Ok(()); + } + if meta.block_hash == response.last().unwrap().block_hash { + break; // done, we do not check the last parent, because that's outside this segment + } + } - // Sort metadata by number, reversed - let metadata = response - .into_iter() - .sorted_by(|a, b| b.block_number.cmp(&a.block_number)) - .collect_vec(); + // Chain segment is sane + let segment = response; - let p1_metadata = metadata.last().unwrap().clone(); - let last_hash = p1_metadata.block_hash; - self.p1_metadata = Some(p1_metadata); + // Record the oldest block in the chain + self.p1_metadata = Some(segment.last().unwrap().clone()); - // TODO: Store peer id. 
// TODO: Insert intermediate landmarks + // Record landmark, including peer that has this set of blocks self.landmarks - .push((metadata.first().as_ref().unwrap().block_hash, from)); + .push((segment.first().as_ref().unwrap().block_hash, from)); tracing::info!( - "sync::MetadataResponse : received {} metadata set #{} from {}", - metadata.len(), + "sync::MetadataResponse : received {} metadata segment #{} from {}", + segment.len(), self.landmarks.len(), from ); - // Store the metadata - for meta in metadata { - self.chain_metadata.insert(meta.block_hash, meta); + // Record the actual chain metadata + for meta in segment { + if self.chain_metadata.insert(meta.block_hash, meta).is_some() { + anyhow::bail!("loop in chain!"); // there is a possible loop in the chain + } } - // If the last block does not exist in our canonical history, fire the next request - if self.db.get_block_by_hash(&last_hash)?.is_some() { + // If the segment does not link to our canonical history, fire the next request + if self + .db + .get_block_by_hash(&self.p1_metadata.as_ref().unwrap().block_hash)? + .is_some() + { // Hit our internal history. Start phase 2. self.p2_metadata = self.p1_metadata.clone(); } else if DO_SPECULATIVE { @@ -452,7 +497,7 @@ impl Sync { from: PeerId, request: RequestBlock, ) -> Result { - tracing::info!( + tracing::debug!( "sync::MetadataRequest : received a metadata request from {}", from ); @@ -485,7 +530,7 @@ impl Sync { /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. /// Otherwise, it requests blocks from the given omega_block. - pub fn request_missing_chain(&mut self, parent_hash: Option) -> Result<()> { + pub fn request_missing_chain(&mut self, block: Option<(Hash, u64)>) -> Result<()> { // Early exit if there's a request in-flight; and if it has not expired. if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { @@ -512,7 +557,14 @@ impl Sync { from_hash: meta.parent_hash, batch_size: self.max_batch_size, }) - } else if let Some(hash) = parent_hash { + } else if let Some((hash, number)) = block { + // insert the starting point for phase 1 + self.p1_metadata = Some(ChainMetaData { + block_hash: Hash::ZERO, // invalid block hash + block_number: number, + parent_hash: hash, + block_timestamp: SystemTime::UNIX_EPOCH, + }); ExternalMessage::MetaDataRequest(RequestBlock { from_number: 0, from_hash: hash, @@ -570,7 +622,7 @@ impl Sync { )?; } - tracing::info!( + tracing::debug!( "sync::InjectProposals : injected {}/{} proposals", len, self.injected From d777a8a6272c8de9aeb0bec91899cc7ff70bb913 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 16:17:30 +0800 Subject: [PATCH 041/119] chore: minor cleanup. --- zilliqa/src/message.rs | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 15ff364aa..26720461d 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -234,11 +234,6 @@ pub struct RequestBlock { pub batch_size: usize, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ResponseBlock { - pub proposals: Vec, -} - /// Used to convey proposal processing internally, to avoid blocking threads for too long. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct InjectedProposal { @@ -291,10 +286,6 @@ pub enum ExternalMessage { Acknowledgement, AddPeer, RemovePeer, - RequestFromNumber(RequestBlock), - RequestFromHash(RequestBlock), - ResponseFromNumber(ResponseBlock), - ResponseFromHash(ResponseBlock), InjectedProposal(InjectedProposal), MetaDataRequest(RequestBlock), MetaDataResponse(Vec), @@ -332,22 +323,6 @@ impl Display for ExternalMessage { } ExternalMessage::AddPeer => write!(f, "AddPeer"), ExternalMessage::RemovePeer => write!(f, "RemovePeer"), - ExternalMessage::ResponseFromNumber(r) => { - write!(f, "ResponseFromNumber({})", r.proposals.len()) - } - ExternalMessage::ResponseFromHash(r) => { - write!(f, "ResponseFromHash({})", r.proposals.len()) - } - ExternalMessage::RequestFromNumber(r) => { - write!( - f, - "RequestFromNumber({}, num={})", - r.from_hash, r.batch_size - ) - } - ExternalMessage::RequestFromHash(r) => { - write!(f, "RequestFromHash({}, num={})", r.from_hash, r.batch_size) - } ExternalMessage::Proposal(p) => write!(f, "Proposal({})", p.view()), ExternalMessage::Vote(v) => write!(f, "Vote({})", v.view), ExternalMessage::NewView(n) => write!(f, "NewView({})", n.view), From 5c339e9abe92cdf551e384f0b16bca817e61f848 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 17:40:00 +0800 Subject: [PATCH 042/119] feat: perform checks to ensure multi-block response matches multi-block request. --- zilliqa/src/sync.rs | 80 ++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 39413c827..af2f31def 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -64,7 +64,7 @@ pub struct Sync { db: Arc, // message bus message_sender: MessageSender, - // internal peers + // internal list of peers, maintained with add_peer/remove_peer. peers: BinaryHeap, // in-flight in_flight: Option, @@ -83,7 +83,7 @@ pub struct Sync { // phase 1 cursor p1_metadata: Option, // phase 2 cursor - p2_metadata: Option, + p2_metadata: Option, // stack of chain landmarks landmarks: Vec<(Hash, PeerId)>, // fixed-size queue of latest proposals @@ -240,45 +240,65 @@ impl Sync { self.done_with_peer(DownGrade::Empty); } else if response.len() < self.max_batch_size { // Partial response, downgrade peer - // TODO: Match against request numbers tracing::warn!("sync::MultiBlockResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { self.done_with_peer(DownGrade::None); } - // Sort proposals by number, ascending - let proposals = response - .into_iter() - .sorted_by_key(|p| p.number()) - .collect_vec(); - tracing::info!( "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", - proposals.len(), + response.len(), self.landmarks.len(), from ); - // Check that this segment is for the expected landmark - if let Some((hash, peer_id)) = self.landmarks.pop() { - // remove the last landmark, should match proposals.last() - let prop_hash = proposals.last().as_ref().unwrap().hash(); - if hash != prop_hash { - tracing::warn!( - "sync::MultiBlockResponse : mismatched landmark {} != {}", - hash, - prop_hash, - ); - self.landmarks.push((hash, peer_id)); // put it back - } + let Some((hash, peer_id)) = self.landmarks.last() else { + tracing::error!("sync::MultiBlockResponse: no more landmarks!"); + return Ok(()); + }; + + // Check that this segment is from the requested peer. 
+ if *peer_id != from { + tracing::error!("sync::MultiBlockResponse: response received from unknown peer {from}"); + return Ok(()); + } + + // Check that this segment starts at the expected landmark + let prop_hash = response.first().as_ref().unwrap().hash(); + if *hash != prop_hash { + tracing::warn!( + "sync::MultiBlockResponse : mismatched landmark {} != {}", + hash, + prop_hash, + ); + return Ok(()); + } + + // Check it matches request hashes + let checksum = response + .iter() + .fold(Hash::builder().with(Hash::ZERO.as_bytes()), |sum, p| { + sum.with(p.hash().as_bytes()) + }) + .finalize(); + if self.p2_metadata.unwrap_or_else(|| Hash::ZERO) != checksum { + tracing::error!("sync::MultiBlockResponse : mismatch request checksum {checksum}"); + return Ok(()); } + // Sort proposals by number, ascending + let proposals = response + .into_iter() + .sorted_by_key(|p| p.number()) + .collect_vec(); + // Remove the blocks from the chain metadata, if they exist for p in &proposals { self.chain_metadata.remove(&p.hash()); } + self.landmarks.pop(); self.inject_proposals(proposals)?; // Done with phase 2, allow phase 1 to restart. @@ -358,11 +378,19 @@ impl Sync { while let Some(meta) = self.chain_metadata.remove(&hash) { request_hashes.push(meta.block_hash); hash = meta.parent_hash; - // TODO: Allow retry of multi-block request - // self.chain_metadata.insert(hash, meta); - self.p2_metadata = Some(meta); + // TODO: Implement retry mechanism + // self.chain_metadata.insert(hash, meta); // reinsert, for retries } + // Checksum of the request hashes + let checksum = request_hashes + .iter() + .fold(Hash::builder().with(Hash::ZERO.as_bytes()), |sum, h| { + sum.with(h.as_bytes()) + }) + .finalize(); + self.p2_metadata = Some(checksum); + // Fire request, to the original peer that sent the segment metadata tracing::info!( "sync::RequestMissingBlocks : requesting {} blocks of segment #{} from {}", @@ -479,7 +507,7 @@ impl Sync { .is_some() { // Hit our internal history. Start phase 2. - self.p2_metadata = self.p1_metadata.clone(); + self.p2_metadata = Some(self.p1_metadata.as_ref().unwrap().block_hash); } else if DO_SPECULATIVE { self.request_missing_chain(None)?; } From b7bc13bd84216541ab0ff2a14fcf74acb1444303 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 6 Jan 2025 17:54:15 +0800 Subject: [PATCH 043/119] feat: allow retries of request_missing_blocks(). --- zilliqa/src/sync.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index af2f31def..75c0ccbaf 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -282,20 +282,22 @@ impl Sync { sum.with(p.hash().as_bytes()) }) .finalize(); - if self.p2_metadata.unwrap_or_else(|| Hash::ZERO) != checksum { + if self.p2_metadata.unwrap_or(Hash::ZERO) != checksum { tracing::error!("sync::MultiBlockResponse : mismatch request checksum {checksum}"); return Ok(()); } - // Sort proposals by number, ascending + // Response seems sane. let proposals = response .into_iter() .sorted_by_key(|p| p.number()) .collect_vec(); - // Remove the blocks from the chain metadata, if they exist + // Remove the blocks from the chain metadata for p in &proposals { - self.chain_metadata.remove(&p.hash()); + if self.chain_metadata.remove(&p.hash()).is_none() { + anyhow::bail!("missing chain data for proposal"); // this should never happen! 
+ } } self.landmarks.pop(); @@ -362,6 +364,10 @@ impl Sync { return Ok(()); } } else if self.injected > self.max_blocks_in_flight { + tracing::warn!( + "sync::RequestMissingBlocks : too many {} blocks in flight", + self.injected + ); return Ok(()); } else if self.p2_metadata.is_none() { tracing::warn!("sync::RequestMissingBlocks : no metadata to request missing blocks"); @@ -373,13 +379,12 @@ impl Sync { self.p2_metadata = None; // If we have no landmarks, we have nothing to do if let Some((hash, peer_id)) = self.landmarks.last() { - let mut hash = *hash; // peek at the last value let mut request_hashes = Vec::with_capacity(self.max_batch_size); - while let Some(meta) = self.chain_metadata.remove(&hash) { + let mut key = *hash; // start from this block + while let Some(meta) = self.chain_metadata.remove(&key) { request_hashes.push(meta.block_hash); - hash = meta.parent_hash; - // TODO: Implement retry mechanism - // self.chain_metadata.insert(hash, meta); // reinsert, for retries + key = meta.parent_hash; + self.chain_metadata.insert(meta.block_hash, meta); // reinsert, for retries } // Checksum of the request hashes From 2c35504c0f8be8384c5a7872580a18ef0014b616 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 7 Jan 2025 15:35:44 +0800 Subject: [PATCH 044/119] feat: added ability to retry phase 1, during phase 2 error. --- zilliqa/src/message.rs | 3 +- zilliqa/src/sync.rs | 254 +++++++++++++++++++++++------------------ 2 files changed, 141 insertions(+), 116 deletions(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 26720461d..4fbcbc6d5 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -229,7 +229,7 @@ impl fmt::Debug for BlockResponse { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RequestBlock { - pub from_number: u64, + pub request_at: SystemTime, pub from_hash: Hash, pub batch_size: usize, } @@ -249,7 +249,6 @@ pub struct ChainMetaData { pub block_hash: Hash, pub parent_hash: Hash, pub block_number: u64, - pub block_timestamp: SystemTime, } /// Used to convey proposal processing internally, to avoid blocking threads for too long. diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 75c0ccbaf..15528d0da 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -66,28 +66,28 @@ pub struct Sync { message_sender: MessageSender, // internal list of peers, maintained with add_peer/remove_peer. peers: BinaryHeap, - // in-flight + // peer handling an in-flight request in_flight: Option, - // in-flight timeout + // in-flight request timeout, before retry request_timeout: Duration, // how many blocks to request at once max_batch_size: usize, // how many blocks to inject into the queue max_blocks_in_flight: usize, + // count of injected proposals pending processing + injected: usize, // our peer id peer_id: PeerId, - // how many injected proposals - injected: usize, - // complete chain metadata + // complete chain metadata, in-memory chain_metadata: BTreeMap, - // phase 1 cursor - p1_metadata: Option, - // phase 2 cursor - p2_metadata: Option, - // stack of chain landmarks - landmarks: Vec<(Hash, PeerId)>, - // fixed-size queue of latest proposals - zip_queue: VecDeque, + // markers to segments in the chain, and the source peer for that segment. + chain_segments: Vec<(PeerId, Hash, u64)>, + // phase 1 cursor containing parent hash, and block number. + p1_cursor: Option<(Hash, u64)>, + // phase 2 cursor containing a hash of a set of hashes. 
+ p2_cursor: Option, + // fixed-size queue of the most recent proposals + recent_proposals: VecDeque, } impl Sync { @@ -118,10 +118,10 @@ impl Sync { in_flight: None, injected: 0, chain_metadata: BTreeMap::new(), - p1_metadata: None, - landmarks: Vec::new(), - p2_metadata: None, - zip_queue: VecDeque::with_capacity(GAP_THRESHOLD), + p1_cursor: None, + chain_segments: Vec::new(), + p2_cursor: None, + recent_proposals: VecDeque::with_capacity(GAP_THRESHOLD), }) } @@ -155,43 +155,44 @@ impl Sync { /// We do not perform checks on the Proposal here. This is done in the consensus layer. pub fn sync_proposal(&mut self, proposal: Proposal) -> Result<()> { // just stuff the latest proposal into the fixed-size queue. - while self.zip_queue.len() >= GAP_THRESHOLD { - self.zip_queue.pop_front(); + while self.recent_proposals.len() >= GAP_THRESHOLD { + self.recent_proposals.pop_front(); } - self.zip_queue.push_back(proposal); + self.recent_proposals.push_back(proposal); // TODO: Replace with single SQL query // Check if block parent exist in history - let parent_hash = self.zip_queue.back().unwrap().header.qc.block_hash; + let parent_hash = self.recent_proposals.back().unwrap().header.qc.block_hash; if self.db.get_block_by_hash(&parent_hash)?.is_none() { // Check if oldes block exists in the history. If it does, we have synced up 99% of the chain. - let ancestor_hash = self.zip_queue.front().unwrap().header.qc.block_hash; - if self.zip_queue.len() == 1 || self.db.get_block_by_hash(&ancestor_hash)?.is_none() { + let ancestor_hash = self.recent_proposals.front().unwrap().header.qc.block_hash; + if self.recent_proposals.len() == 1 + || self.db.get_block_by_hash(&ancestor_hash)?.is_none() + { // No ancestor block, trigger sync tracing::warn!( "sync::SyncProposal : parent block {} not found", parent_hash ); - if self.p2_metadata.is_some() { + if self.p2_cursor.is_some() { // Continue phase 2 self.request_missing_blocks()?; - } else if self.p1_metadata.is_some() { + } else if self.p1_cursor.is_some() { // Continue phase 1 - self.request_missing_chain(None)?; + self.request_missing_metadata(None)?; } else { // Start phase 1 - let block_number = self.zip_queue.back().unwrap().number(); - self.request_missing_chain(Some((parent_hash, block_number)))?; + let block_number = self.recent_proposals.back().unwrap().number(); + self.request_missing_metadata(Some((parent_hash, block_number)))?; } } else { // 99% synced, zip it up! tracing::info!( - "sync::SyncProposal : zip up {} blocks from {}", - self.zip_queue.len(), - ancestor_hash + "sync::SyncProposal : finishing up {} blocks for segment #0 from {ancestor_hash}", + self.recent_proposals.len() ); // parent block exists, inject the proposal - let proposals = self.zip_queue.drain(..).collect_vec(); + let proposals = self.recent_proposals.drain(..).collect_vec(); self.inject_proposals(proposals)?; // we're done } @@ -220,26 +221,53 @@ impl Sync { block_number: block.number(), block_hash: block.hash(), parent_hash: block.parent_hash(), - block_timestamp: block.timestamp(), } } + /// Retry phase 1 + /// + /// If something went wrong, phase 1 may need to be retried for the most recent segment. + /// Pop the segment from the landmark, and continue phase 1. 
+ fn retry_phase1(&mut self) -> Result<()> { + if self.chain_segments.is_empty() { + tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain_segments!"); + return Ok(()); + } + + // remove the last segment from the chain metadata + let (peer, hash, num) = self.chain_segments.pop().unwrap(); + let mut key = hash; + while let Some(p) = self.chain_metadata.remove(&key) { + key = p.parent_hash; + } + + // set the p1/p2 cursor value, to allow retry from p1 + self.p1_cursor = Some((hash, num)); + self.p2_cursor = None; + tracing::info!("sync::RetryPhase1 : retrying block {hash} from {peer}"); + if DO_SPECULATIVE { + self.request_missing_metadata(None)?; + } + Ok(()) + } + /// Handle a multi-block response. /// - /// This is the final step in the syncing algorithm, where we receive a set of blocks and inject them into - /// the pipeline. We also remove the blocks from the chain metadata, because they are now in the pipeline. + /// This is phase 2 in the syncing algorithm, where we receive a set of blocks and inject them into the pipeline. + /// We also remove the blocks from the chain metadata, because they are now in the pipeline. pub fn handle_multiblock_response( &mut self, from: PeerId, response: Vec, ) -> Result<()> { - // Process whatever we received + // Process only a full response if response.is_empty() { - // Empty response, downgrade peer + // Empty response, downgrade peer and retry phase 1. tracing::warn!("sync::MultiBlockResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); + return self.retry_phase1(); } else if response.len() < self.max_batch_size { - // Partial response, downgrade peer + // Partial response, downgrade peer but process the block. tracing::warn!("sync::MultiBlockResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { @@ -249,42 +277,38 @@ impl Sync { tracing::info!( "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", response.len(), - self.landmarks.len(), + self.chain_segments.len(), from ); - let Some((hash, peer_id)) = self.landmarks.last() else { - tracing::error!("sync::MultiBlockResponse: no more landmarks!"); - return Ok(()); + // Spurious response + let Some((peer_id, hash, _)) = self.chain_segments.last() else { + anyhow::bail!("sync::MultiBlockResponse: no more chain_segments!"); }; - // Check that this segment is from the requested peer. + // If the response is not from the expected peer, retry phase 2. if *peer_id != from { - tracing::error!("sync::MultiBlockResponse: response received from unknown peer {from}"); + tracing::warn!("sync::MultiBlockResponse: unknown peer {from}, will retry"); return Ok(()); } - // Check that this segment starts at the expected landmark + // Segment history does not match, retry phase 1. let prop_hash = response.first().as_ref().unwrap().hash(); if *hash != prop_hash { - tracing::warn!( - "sync::MultiBlockResponse : mismatched landmark {} != {}", - hash, - prop_hash, - ); - return Ok(()); + tracing::error!("sync::MultiBlockResponse : mismatched landmark {hash} != {prop_hash}"); + return self.retry_phase1(); } - // Check it matches request hashes + // If the checksum does not match, retry phase 1. Maybe the node has pruned the segment. 
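// A minimal illustrative sketch, not from the patch: the same order-sensitive
// digest can be computed by the requester over the hashes it asks for (stored
// in p2_cursor) and over the proposals that come back. Because the digest is
// built by folding the hashes in order, a response with the right blocks in the
// wrong order, or a stale/partial segment, fails the comparison below and
// triggers a retry of phase 1. `segment_checksum` is a hypothetical helper
// name; the patch computes this inline, using the crate's `Hash` builder
// exactly as shown here.
fn segment_checksum(hashes: impl IntoIterator<Item = Hash>) -> Hash {
    hashes
        .into_iter()
        .fold(Hash::builder().with(Hash::ZERO.as_bytes()), |sum, h| {
            sum.with(h.as_bytes())
        })
        .finalize()
}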
let checksum = response .iter() .fold(Hash::builder().with(Hash::ZERO.as_bytes()), |sum, p| { sum.with(p.hash().as_bytes()) }) .finalize(); - if self.p2_metadata.unwrap_or(Hash::ZERO) != checksum { - tracing::error!("sync::MultiBlockResponse : mismatch request checksum {checksum}"); - return Ok(()); + if self.p2_cursor.unwrap_or(Hash::ZERO) != checksum { + tracing::error!("sync::MultiBlockResponse : mismatch history {checksum}"); + return self.retry_phase1(); } // Response seems sane. @@ -300,12 +324,13 @@ impl Sync { } } - self.landmarks.pop(); + // Done with this segment + self.chain_segments.pop(); self.inject_proposals(proposals)?; // Done with phase 2, allow phase 1 to restart. - if self.landmarks.is_empty() { - self.p1_metadata = None; + if self.chain_segments.is_empty() { + self.p1_cursor = None; self.chain_metadata.clear(); } else if DO_SPECULATIVE && self.injected < self.max_blocks_in_flight { // Speculatively request more blocks @@ -369,16 +394,16 @@ impl Sync { self.injected ); return Ok(()); - } else if self.p2_metadata.is_none() { + } else if self.p2_cursor.is_none() { tracing::warn!("sync::RequestMissingBlocks : no metadata to request missing blocks"); return Ok(()); } // will be re-inserted below if let Some(peer) = self.get_next_peer() { - self.p2_metadata = None; - // If we have no landmarks, we have nothing to do - if let Some((hash, peer_id)) = self.landmarks.last() { + self.p2_cursor = None; + // If we have no chain_segments, we have nothing to do + if let Some((peer_id, hash, _)) = self.chain_segments.last() { let mut request_hashes = Vec::with_capacity(self.max_batch_size); let mut key = *hash; // start from this block while let Some(meta) = self.chain_metadata.remove(&key) { @@ -394,13 +419,13 @@ impl Sync { sum.with(h.as_bytes()) }) .finalize(); - self.p2_metadata = Some(checksum); + self.p2_cursor = Some(checksum); // Fire request, to the original peer that sent the segment metadata tracing::info!( "sync::RequestMissingBlocks : requesting {} blocks of segment #{} from {}", request_hashes.len(), - self.landmarks.len(), + self.chain_segments.len(), peer_id, ); self.message_sender.send_external_message( @@ -414,7 +439,7 @@ impl Sync { score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers }); } else { - // No more landmarks, we're done + // No more chain_segments, we're done self.peers.push(peer); } } else { @@ -428,7 +453,7 @@ impl Sync { /// Handle a response to a metadata request. /// /// This is the first step in the syncing algorithm, where we receive a set of metadata and use it to - /// construct a chain history. We then request the missing blocks from the chain. + /// construct a chain history. pub fn handle_metadata_response( &mut self, from: PeerId, @@ -436,12 +461,12 @@ impl Sync { ) -> Result<()> { // Process whatever we have received. if response.is_empty() { - // Empty response, downgrade peer + // Empty response, downgrade peer and retry with a new peer. tracing::warn!("sync::MetadataResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); } else if response.len() < self.max_batch_size { - // Partial response, downgrade peer + // Partial response, downgrade peer but accept the response. 
tracing::warn!("sync::MetadataResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { @@ -449,29 +474,28 @@ impl Sync { } // Check the linkage of the returned chain - let Some(p1) = self.p1_metadata.as_ref() else { + let Some((p1_hash, p1_num)) = self.p1_cursor.as_ref() else { tracing::error!( - "no way to check chain linkage from {}", + "synce::MetadataResponse : no way to check chain history from {}", response.first().unwrap().block_hash ); return Ok(()); }; - let mut parent_hash = p1.parent_hash; - let mut parent_num = p1.block_number; + let mut block_hash = *p1_hash; + let mut block_num = *p1_num; for meta in response.iter() { // check that the block hash and number is as expected. if meta.block_hash != Hash::ZERO - && meta.block_hash == parent_hash - && parent_num == meta.block_number + 1 + && block_hash == meta.block_hash + && block_num == meta.block_number + 1 { - parent_hash = meta.parent_hash; - parent_num = meta.block_number; + block_hash = meta.parent_hash; + block_num = meta.block_number; } else { - // if something does not match, we will retry the request with the next peer. // TODO: possibly, discard and rebuild entire chain + // if something does not match, do nothing and retry the request with the next peer. tracing::error!( - "sync::MetadataResponse : retry metadata history for {}", - parent_hash + "sync::MetadataResponse : retry metadata history for {block_hash}/{block_num}" ); return Ok(()); } @@ -483,22 +507,24 @@ impl Sync { // Chain segment is sane let segment = response; - // Record the oldest block in the chain - self.p1_metadata = Some(segment.last().unwrap().clone()); - - // TODO: Insert intermediate landmarks // Record landmark, including peer that has this set of blocks - self.landmarks - .push((segment.first().as_ref().unwrap().block_hash, from)); + self.chain_segments.push((from, *p1_hash, *p1_num)); + + // Record the oldest block in the chain's parent + self.p1_cursor = Some(( + segment.last().unwrap().parent_hash, + segment.last().unwrap().block_number, + )); tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", segment.len(), - self.landmarks.len(), + self.chain_segments.len(), from ); // Record the actual chain metadata + let last_block_hash = segment.last().as_ref().unwrap().block_hash; for meta in segment { if self.chain_metadata.insert(meta.block_hash, meta).is_some() { anyhow::bail!("loop in chain!"); // there is a possible loop in the chain @@ -506,15 +532,11 @@ impl Sync { } // If the segment does not link to our canonical history, fire the next request - if self - .db - .get_block_by_hash(&self.p1_metadata.as_ref().unwrap().block_hash)? - .is_some() - { - // Hit our internal history. Start phase 2. - self.p2_metadata = Some(self.p1_metadata.as_ref().unwrap().block_hash); + if self.db.get_block_by_hash(&last_block_hash)?.is_some() { + // Hit our internal history. Next, phase 2. + self.p2_cursor = Some(Hash::ZERO); } else if DO_SPECULATIVE { - self.request_missing_chain(None)?; + self.request_missing_metadata(None)?; } Ok(()) @@ -535,6 +557,12 @@ impl Sync { from ); + // Do not respond to stale requests + if request.request_at.elapsed()? > self.request_timeout { + tracing::warn!("sync::MetadataRequest : stale request"); + return Ok(ExternalMessage::Acknowledgement); + } + // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. 
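// A minimal illustrative sketch, not from the patch: one possible shape for the
// TODO above. The staleness test mirrors the `request_at.elapsed()` check in
// `handle_metadata_request`; the `is_busy` input is a hypothetical signal (for
// example, an active validator with a proposal to build) that the patch has not
// defined. Types are the same `SystemTime`/`Duration` already used in this file.
fn should_serve_metadata_request(
    request_at: SystemTime,
    request_timeout: Duration,
    is_busy: bool,
) -> bool {
    // A request older than the requester's own retry timeout will be re-issued
    // to another peer anyway, so answering it only wastes bandwidth.
    let stale = request_at
        .elapsed()
        .map(|age| age > request_timeout)
        .unwrap_or(true);
    !stale && !is_busy
}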
@@ -563,53 +591,51 @@ impl Sync { /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. /// Otherwise, it requests blocks from the given omega_block. - pub fn request_missing_chain(&mut self, block: Option<(Hash, u64)>) -> Result<()> { + pub fn request_missing_metadata(&mut self, block: Option<(Hash, u64)>) -> Result<()> { // Early exit if there's a request in-flight; and if it has not expired. if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { tracing::warn!( - "sync::RequestMissingChain : in-flight request {} timed out, requesting from new peer", + "sync::RequestMissingMetadata : in-flight request {} timed out, requesting from new peer", peer.peer_id ); self.done_with_peer(DownGrade::Timeout); } else { return Ok(()); } - } else if self.injected > 0 { - tracing::warn!( - "sync::RequestMissingChain : too many {} blocks in flight", - self.injected - ); + } else if self.p2_cursor.is_some() { + tracing::warn!("sync::RequestMissingMetadata : phase 2 in progress"); return Ok(()); + // } else if self.injected > 0 { + // tracing::warn!( + // "sync::RequestMissingMetadata : too many {} blocks in flight", + // self.injected + // ); + // return Ok(()); } if let Some(peer) = self.get_next_peer() { - let message = if let Some(meta) = self.p1_metadata.as_ref() { + let message = if let Some((hash, _)) = self.p1_cursor.as_ref() { ExternalMessage::MetaDataRequest(RequestBlock { - from_number: 0, - from_hash: meta.parent_hash, + request_at: SystemTime::now(), + from_hash: *hash, batch_size: self.max_batch_size, }) } else if let Some((hash, number)) = block { // insert the starting point for phase 1 - self.p1_metadata = Some(ChainMetaData { - block_hash: Hash::ZERO, // invalid block hash - block_number: number, - parent_hash: hash, - block_timestamp: SystemTime::UNIX_EPOCH, - }); + self.p1_cursor = Some((hash, number)); ExternalMessage::MetaDataRequest(RequestBlock { - from_number: 0, + request_at: SystemTime::now(), from_hash: hash, batch_size: self.max_batch_size, }) } else { - todo!("sync::RequestMissingChain : no metadata to request missing blocks"); + todo!("sync::RequestMissingMetadata : no metadata to request missing blocks"); }; tracing::info!( ?message, - "sync::RequestMissingChain : requesting missing chain from {}", + "sync::RequestMissingMetadata : requesting missing chain from {}", peer.peer_id ); self.message_sender @@ -618,7 +644,7 @@ impl Sync { self.in_flight = Some(peer); } else { tracing::warn!( - "sync::RequestMissingChain : insufficient peers to request missing blocks" + "sync::RequestMissingMetadata : insufficient peers to request missing blocks" ); } Ok(()) From 3fae169c6ab222d7380d680943eb6ba7af3090ba Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 7 Jan 2025 21:28:19 +0800 Subject: [PATCH 045/119] feat: combined p1_cursor/p2_cursor into a self.state value. --- zilliqa/src/sync.rs | 141 +++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 66 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 15528d0da..2f33bebc9 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -18,13 +18,6 @@ use crate::{ time::SystemTime, }; -enum DownGrade { - None, - Partial, - Timeout, - Empty, -} - // Syncing Algorithm // // When a Proposal is received by Consensus, we check if the parent exists in our DB. 
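// Illustrative summary, not from the patch: the intended happy path through the
// state machine that this commit introduces (the SyncState enum is added at the
// bottom of this file). Variant payloads are elided here.
//
//   Phase0            --(proposal arrives whose parent is unknown)--> Phase1(start block)
//   Phase1(cursor)    --(metadata segment links to local history)-->  Phase2(request checksum)
//   Phase2(checksum)  --(last queued segment fetched and injected)--> Phase3
//   Phase3            --(recent-proposal queue zipped up)-->          Phase0
//
// Failures walk backwards instead: a bad multi-block response pops the newest
// segment marker and drops back to Phase1 via retry_phase1().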
@@ -78,14 +71,12 @@ pub struct Sync { injected: usize, // our peer id peer_id: PeerId, + // internal sync state + state: SyncState, // complete chain metadata, in-memory chain_metadata: BTreeMap, // markers to segments in the chain, and the source peer for that segment. chain_segments: Vec<(PeerId, Hash, u64)>, - // phase 1 cursor containing parent hash, and block number. - p1_cursor: Option<(Hash, u64)>, - // phase 2 cursor containing a hash of a set of hashes. - p2_cursor: Option, // fixed-size queue of the most recent proposals recent_proposals: VecDeque, } @@ -118,9 +109,8 @@ impl Sync { in_flight: None, injected: 0, chain_metadata: BTreeMap::new(), - p1_cursor: None, chain_segments: Vec::new(), - p2_cursor: None, + state: SyncState::Phase0, recent_proposals: VecDeque::with_capacity(GAP_THRESHOLD), }) } @@ -174,16 +164,22 @@ impl Sync { "sync::SyncProposal : parent block {} not found", parent_hash ); - if self.p2_cursor.is_some() { - // Continue phase 2 - self.request_missing_blocks()?; - } else if self.p1_cursor.is_some() { - // Continue phase 1 - self.request_missing_metadata(None)?; - } else { - // Start phase 1 - let block_number = self.recent_proposals.back().unwrap().number(); - self.request_missing_metadata(Some((parent_hash, block_number)))?; + // TODO: Move this up + match self.state { + SyncState::Phase0 => { + // Start phase 1 + let block_number = self.recent_proposals.back().unwrap().number(); + self.request_missing_metadata(Some((parent_hash, block_number)))?; + } + SyncState::Phase1(_, _) => { + // Continue phase 1 + self.request_missing_metadata(None)?; + } + SyncState::Phase2(_) => { + // Continue phase 2 + self.request_missing_blocks()?; + } + SyncState::Phase3 => {} } } else { // 99% synced, zip it up! @@ -241,9 +237,8 @@ impl Sync { key = p.parent_hash; } - // set the p1/p2 cursor value, to allow retry from p1 - self.p1_cursor = Some((hash, num)); - self.p2_cursor = None; + // allow retry from p1 + self.state = SyncState::Phase1(hash, num); tracing::info!("sync::RetryPhase1 : retrying block {hash} from {peer}"); if DO_SPECULATIVE { self.request_missing_metadata(None)?; @@ -274,6 +269,10 @@ impl Sync { self.done_with_peer(DownGrade::None); } + let SyncState::Phase2(p2_hash) = self.state else { + anyhow::bail!("sync::MultiBlockResponse : invalid state"); + }; + tracing::info!( "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", response.len(), @@ -306,7 +305,8 @@ impl Sync { sum.with(p.hash().as_bytes()) }) .finalize(); - if self.p2_cursor.unwrap_or(Hash::ZERO) != checksum { + + if p2_hash != checksum { tracing::error!("sync::MultiBlockResponse : mismatch history {checksum}"); return self.retry_phase1(); } @@ -330,8 +330,7 @@ impl Sync { // Done with phase 2, allow phase 1 to restart. 
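// Illustrative note, not from the patch: an empty `chain_segments` stack means
// every segment recorded during phase 1 has now been fetched and injected, so
// the sync can move to Phase3, where the recent-proposal queue is zipped up
// once its oldest ancestor appears in the local DB. Otherwise the next segment
// is requested, speculatively when DO_SPECULATIVE is enabled.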
if self.chain_segments.is_empty() { - self.p1_cursor = None; - self.chain_metadata.clear(); + self.state = SyncState::Phase3; } else if DO_SPECULATIVE && self.injected < self.max_blocks_in_flight { // Speculatively request more blocks self.request_missing_blocks()?; @@ -394,14 +393,15 @@ impl Sync { self.injected ); return Ok(()); - } else if self.p2_cursor.is_none() { + }; + + let SyncState::Phase2(_) = self.state else { tracing::warn!("sync::RequestMissingBlocks : no metadata to request missing blocks"); return Ok(()); - } + }; // will be re-inserted below if let Some(peer) = self.get_next_peer() { - self.p2_cursor = None; // If we have no chain_segments, we have nothing to do if let Some((peer_id, hash, _)) = self.chain_segments.last() { let mut request_hashes = Vec::with_capacity(self.max_batch_size); @@ -419,7 +419,7 @@ impl Sync { sum.with(h.as_bytes()) }) .finalize(); - self.p2_cursor = Some(checksum); + self.state = SyncState::Phase2(checksum); // Fire request, to the original peer that sent the segment metadata tracing::info!( @@ -474,15 +474,12 @@ impl Sync { } // Check the linkage of the returned chain - let Some((p1_hash, p1_num)) = self.p1_cursor.as_ref() else { - tracing::error!( - "synce::MetadataResponse : no way to check chain history from {}", - response.first().unwrap().block_hash - ); - return Ok(()); + let SyncState::Phase1(p1_hash, p1_num) = self.state else { + anyhow::bail!("sync::MetadataResponse : invalid state"); }; - let mut block_hash = *p1_hash; - let mut block_num = *p1_num; + + let mut block_hash = p1_hash; + let mut block_num = p1_num; for meta in response.iter() { // check that the block hash and number is as expected. if meta.block_hash != Hash::ZERO @@ -508,13 +505,13 @@ impl Sync { let segment = response; // Record landmark, including peer that has this set of blocks - self.chain_segments.push((from, *p1_hash, *p1_num)); + self.chain_segments.push((from, p1_hash, p1_num)); // Record the oldest block in the chain's parent - self.p1_cursor = Some(( + self.state = SyncState::Phase1( segment.last().unwrap().parent_hash, segment.last().unwrap().block_number, - )); + ); tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", @@ -534,7 +531,7 @@ impl Sync { // If the segment does not link to our canonical history, fire the next request if self.db.get_block_by_hash(&last_block_hash)?.is_some() { // Hit our internal history. Next, phase 2. 
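// Illustrative note, not from the patch: `Hash::ZERO` appears to act as a
// placeholder checksum here, marking "ready for phase 2" before any multi-block
// request has been sent; `request_missing_blocks()` then overwrites it with the
// digest of the hashes it actually requests.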
- self.p2_cursor = Some(Hash::ZERO); + self.state = SyncState::Phase2(Hash::ZERO); } else if DO_SPECULATIVE { self.request_missing_metadata(None)?; } @@ -603,39 +600,33 @@ impl Sync { } else { return Ok(()); } - } else if self.p2_cursor.is_some() { + } else if let SyncState::Phase2(_) = self.state { tracing::warn!("sync::RequestMissingMetadata : phase 2 in progress"); return Ok(()); - // } else if self.injected > 0 { - // tracing::warn!( - // "sync::RequestMissingMetadata : too many {} blocks in flight", - // self.injected - // ); - // return Ok(()); } if let Some(peer) = self.get_next_peer() { - let message = if let Some((hash, _)) = self.p1_cursor.as_ref() { - ExternalMessage::MetaDataRequest(RequestBlock { - request_at: SystemTime::now(), - from_hash: *hash, - batch_size: self.max_batch_size, - }) - } else if let Some((hash, number)) = block { - // insert the starting point for phase 1 - self.p1_cursor = Some((hash, number)); - ExternalMessage::MetaDataRequest(RequestBlock { + let message = match self.state { + SyncState::Phase1(hash, _) => ExternalMessage::MetaDataRequest(RequestBlock { request_at: SystemTime::now(), from_hash: hash, batch_size: self.max_batch_size, - }) - } else { - todo!("sync::RequestMissingMetadata : no metadata to request missing blocks"); + }), + SyncState::Phase0 if block.is_some() => { + let (hash, number) = block.unwrap(); + self.state = SyncState::Phase1(hash, number); + ExternalMessage::MetaDataRequest(RequestBlock { + request_at: SystemTime::now(), + from_hash: hash, + batch_size: self.max_batch_size, + }) + } + _ => anyhow::bail!("sync::MissingMetadata : invalid state"), }; tracing::info!( ?message, - "sync::RequestMissingMetadata : requesting missing chain from {}", + "sync::RequestMissingMetadata : requesting {} missing chain from {}", peer.peer_id ); self.message_sender @@ -753,3 +744,21 @@ impl PartialOrd for PeerInfo { Some(self.cmp(other)) } } + +/// Peer downgrade states/values, for downgrading an internal peer from selection. +#[derive(Debug)] +enum DownGrade { + None, + Partial, + Timeout, + Empty, +} + +/// Sync state +#[derive(Debug)] +enum SyncState { + Phase0, + Phase1(Hash, u64), + Phase2(Hash), + Phase3, +} From 4c0a274d928b63cf76bd4e93a6bebaa2b20430f6 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 7 Jan 2025 22:29:28 +0800 Subject: [PATCH 046/119] feat: restructure sync_proposal() to make it legible. --- zilliqa/src/sync.rs | 102 ++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 2f33bebc9..ba1110b0d 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -150,49 +150,44 @@ impl Sync { } self.recent_proposals.push_back(proposal); - // TODO: Replace with single SQL query - // Check if block parent exist in history - let parent_hash = self.recent_proposals.back().unwrap().header.qc.block_hash; - if self.db.get_block_by_hash(&parent_hash)?.is_none() { - // Check if oldes block exists in the history. If it does, we have synced up 99% of the chain. 
- let ancestor_hash = self.recent_proposals.front().unwrap().header.qc.block_hash; - if self.recent_proposals.len() == 1 - || self.db.get_block_by_hash(&ancestor_hash)?.is_none() - { - // No ancestor block, trigger sync - tracing::warn!( - "sync::SyncProposal : parent block {} not found", - parent_hash - ); - // TODO: Move this up - match self.state { - SyncState::Phase0 => { - // Start phase 1 - let block_number = self.recent_proposals.back().unwrap().number(); - self.request_missing_metadata(Some((parent_hash, block_number)))?; - } - SyncState::Phase1(_, _) => { - // Continue phase 1 - self.request_missing_metadata(None)?; - } - SyncState::Phase2(_) => { - // Continue phase 2 - self.request_missing_blocks()?; - } - SyncState::Phase3 => {} + match self.state { + // Check if we are out of sync + SyncState::Phase0 if self.injected == 0 => { + let parent_hash = self.recent_proposals.back().unwrap().header.qc.block_hash; + if self.db.get_block_by_hash(&parent_hash)?.is_none() { + // No parent block, trigger sync + tracing::warn!("sync::SyncProposal : syncing from {parent_hash}",); + let block_number = self.recent_proposals.back().unwrap().number(); + self.request_missing_metadata(Some((parent_hash, block_number)))?; } - } else { - // 99% synced, zip it up! - tracing::info!( - "sync::SyncProposal : finishing up {} blocks for segment #0 from {ancestor_hash}", - self.recent_proposals.len() - ); - // parent block exists, inject the proposal - let proposals = self.recent_proposals.drain(..).collect_vec(); - self.inject_proposals(proposals)?; - // we're done + } + // Continue phase 1, until we hit history/genesis. + SyncState::Phase1(_, _) if self.injected < self.max_batch_size => { + self.request_missing_metadata(None)?; + } + // Continue phase 2, until we have all segments. + SyncState::Phase2(_) if self.injected < self.max_blocks_in_flight => { + self.request_missing_blocks()?; + } + // Wait till 99% synced, zip it up! + SyncState::Phase3 if self.injected == 0 => { + let ancestor_hash = self.recent_proposals.front().unwrap().header.qc.block_hash; + if self.db.get_block_by_hash(&ancestor_hash)?.is_some() { + tracing::info!( + "sync::SyncProposal : finishing up {} blocks for segment #0 from {ancestor_hash}", + self.recent_proposals.len() + ); + // inject the proposals + let proposals = self.recent_proposals.drain(..).collect_vec(); + self.inject_proposals(proposals)?; + } + self.state = SyncState::Phase0; + } + _ => { + tracing::debug!("sync::SyncProposal : syncing {} blocks", self.injected); } } + Ok(()) } @@ -328,7 +323,7 @@ impl Sync { self.chain_segments.pop(); self.inject_proposals(proposals)?; - // Done with phase 2, allow phase 1 to restart. + // Done with phase 2 if self.chain_segments.is_empty() { self.state = SyncState::Phase3; } else if DO_SPECULATIVE && self.injected < self.max_blocks_in_flight { @@ -376,6 +371,9 @@ impl Sync { /// These hashes are then sent to a Peer for retrieval. /// This is Part 2 of the syncing algorithm. fn request_missing_blocks(&mut self) -> Result<()> { + if !matches!(self.state, SyncState::Phase2(_)) { + anyhow::bail!("sync::RequestMissingBlocks : invalid state"); + } // Early exit if there's a request in-flight; and if it has not expired. 
if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { @@ -389,16 +387,11 @@ impl Sync { } } else if self.injected > self.max_blocks_in_flight { tracing::warn!( - "sync::RequestMissingBlocks : too many {} blocks in flight", + "sync::RequestMissingBlocks : syncing {} blocks in flight", self.injected ); return Ok(()); - }; - - let SyncState::Phase2(_) = self.state else { - tracing::warn!("sync::RequestMissingBlocks : no metadata to request missing blocks"); - return Ok(()); - }; + } // will be re-inserted below if let Some(peer) = self.get_next_peer() { @@ -589,6 +582,9 @@ impl Sync { /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. /// Otherwise, it requests blocks from the given omega_block. pub fn request_missing_metadata(&mut self, block: Option<(Hash, u64)>) -> Result<()> { + if matches!(self.state, SyncState::Phase2(_)) || matches!(self.state, SyncState::Phase3) { + anyhow::bail!("sync::RequestMissingMetadata : invalid state"); + } // Early exit if there's a request in-flight; and if it has not expired. if let Some(peer) = self.in_flight.as_ref() { if peer.last_used.elapsed() > self.request_timeout { @@ -600,8 +596,12 @@ impl Sync { } else { return Ok(()); } - } else if let SyncState::Phase2(_) = self.state { - tracing::warn!("sync::RequestMissingMetadata : phase 2 in progress"); + } else if self.injected > self.max_batch_size { + // anything more than this and we cannot check whether the segment hits history + tracing::warn!( + "sync::RequestMissingMetadata : syncing {} blocks in flight", + self.injected + ); return Ok(()); } @@ -626,7 +626,7 @@ impl Sync { tracing::info!( ?message, - "sync::RequestMissingMetadata : requesting {} missing chain from {}", + "sync::RequestMissingMetadata : requesting missing chain from {}", peer.peer_id ); self.message_sender From 2d1b044d8dccadb76fa2eabd9214880dd504a51a Mon Sep 17 00:00:00 2001 From: Shawn Date: Wed, 8 Jan 2025 00:00:38 +0800 Subject: [PATCH 047/119] checkpoint: working sync with state machine. --- zilliqa/src/sync.rs | 79 +++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index ba1110b0d..cb428a38f 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -67,8 +67,8 @@ pub struct Sync { max_batch_size: usize, // how many blocks to inject into the queue max_blocks_in_flight: usize, - // count of injected proposals pending processing - injected: usize, + // count of proposals pending in the pipeline + in_pipeline: usize, // our peer id peer_id: PeerId, // internal sync state @@ -107,7 +107,7 @@ impl Sync { max_batch_size: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks in_flight: None, - injected: 0, + in_pipeline: usize::MIN, chain_metadata: BTreeMap::new(), chain_segments: Vec::new(), state: SyncState::Phase0, @@ -115,26 +115,6 @@ impl Sync { }) } - /// Mark a received proposal - /// - /// Mark a proposal as received, and remove it from the cache. 
- pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { - if prop.from != self.peer_id { - tracing::error!( - "sync::MarkReceivedProposal : foreign InjectedProposal from {}", - prop.from - ); - } - if let Some(p) = self.chain_metadata.remove(&prop.block.hash()) { - tracing::warn!( - "sync::MarkReceivedProposal : removing stale metadata {}", - p.block_hash - ); - } - self.injected = self.injected.saturating_sub(1); - Ok(()) - } - /// Sync a block proposal. /// /// This is the main entry point for syncing a block proposal. @@ -152,7 +132,7 @@ impl Sync { match self.state { // Check if we are out of sync - SyncState::Phase0 if self.injected == 0 => { + SyncState::Phase0 if self.in_pipeline == 0 => { let parent_hash = self.recent_proposals.back().unwrap().header.qc.block_hash; if self.db.get_block_by_hash(&parent_hash)?.is_none() { // No parent block, trigger sync @@ -162,15 +142,15 @@ impl Sync { } } // Continue phase 1, until we hit history/genesis. - SyncState::Phase1(_, _) if self.injected < self.max_batch_size => { + SyncState::Phase1(_, _) if self.in_pipeline < self.max_batch_size => { self.request_missing_metadata(None)?; } // Continue phase 2, until we have all segments. - SyncState::Phase2(_) if self.injected < self.max_blocks_in_flight => { + SyncState::Phase2(_) if self.in_pipeline < self.max_blocks_in_flight => { self.request_missing_blocks()?; } // Wait till 99% synced, zip it up! - SyncState::Phase3 if self.injected == 0 => { + SyncState::Phase3 if self.in_pipeline == 0 => { let ancestor_hash = self.recent_proposals.front().unwrap().header.qc.block_hash; if self.db.get_block_by_hash(&ancestor_hash)?.is_some() { tracing::info!( @@ -184,7 +164,10 @@ impl Sync { self.state = SyncState::Phase0; } _ => { - tracing::debug!("sync::SyncProposal : syncing {} blocks", self.injected); + tracing::debug!( + "sync::SyncProposal : syncing {} blocks in pipeline", + self.in_pipeline + ); } } @@ -326,7 +309,7 @@ impl Sync { // Done with phase 2 if self.chain_segments.is_empty() { self.state = SyncState::Phase3; - } else if DO_SPECULATIVE && self.injected < self.max_blocks_in_flight { + } else if DO_SPECULATIVE { // Speculatively request more blocks self.request_missing_blocks()?; } @@ -385,10 +368,10 @@ impl Sync { } else { return Ok(()); } - } else if self.injected > self.max_blocks_in_flight { + } else if self.in_pipeline > self.max_blocks_in_flight { tracing::warn!( - "sync::RequestMissingBlocks : syncing {} blocks in flight", - self.injected + "sync::RequestMissingBlocks : syncing {} blocks in pipeline", + self.in_pipeline ); return Ok(()); } @@ -459,7 +442,7 @@ impl Sync { self.done_with_peer(DownGrade::Empty); return Ok(()); } else if response.len() < self.max_batch_size { - // Partial response, downgrade peer but accept the response. + // Partial response, downgrade peer but process the response. 
tracing::warn!("sync::MetadataResponse : partial blocks {from}",); self.done_with_peer(DownGrade::Partial); } else { @@ -596,11 +579,11 @@ impl Sync { } else { return Ok(()); } - } else if self.injected > self.max_batch_size { + } else if self.in_pipeline > self.max_batch_size { // anything more than this and we cannot check whether the segment hits history tracing::warn!( - "sync::RequestMissingMetadata : syncing {} blocks in flight", - self.injected + "sync::RequestMissingMetadata : syncing {} blocks in pipeline", + self.in_pipeline ); return Ok(()); } @@ -652,7 +635,7 @@ impl Sync { } // Increment proposals injected - self.injected = self.injected.saturating_add(proposals.len()); + self.in_pipeline = self.in_pipeline.saturating_add(proposals.len()); let len = proposals.len(); // Just pump the Proposals back to ourselves. @@ -675,12 +658,32 @@ impl Sync { tracing::debug!( "sync::InjectProposals : injected {}/{} proposals", len, - self.injected + self.in_pipeline ); // return last proposal Ok(()) } + /// Mark a received proposal + /// + /// Mark a proposal as received, and remove it from the cache. + pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { + if prop.from != self.peer_id { + tracing::error!( + "sync::MarkReceivedProposal : foreign InjectedProposal from {}", + prop.from + ); + } + if let Some(p) = self.chain_metadata.remove(&prop.block.hash()) { + tracing::warn!( + "sync::MarkReceivedProposal : removing stale metadata {}", + p.block_hash + ); + } + self.in_pipeline = self.in_pipeline.saturating_sub(1); + Ok(()) + } + /// Downgrade a peer based on the response received. fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { From b583758adcb6582a6c643d68e196494aa72ac63e Mon Sep 17 00:00:00 2001 From: Shawn Date: Wed, 8 Jan 2025 16:07:54 +0800 Subject: [PATCH 048/119] Revert "sec: make RequestId random, to mitigate response injections." This reverts commit 33d45f6d516f9cb4247d224ebfc948a6259d89c9. --- zilliqa/src/node.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index b8d2f535f..2e712ad52 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -84,7 +84,9 @@ impl MessageSender { } pub fn next_request_id(&mut self) -> RequestId { - RequestId(rand::random()) // TODO: make this more secure, non-predictable + let request_id = self.request_id; + self.request_id.0 = self.request_id.0.wrapping_add(1); + request_id } /// Send a message to a remote node of the same shard. From bbfaf530bd6d04f3a26163087c036e957c4d2619 Mon Sep 17 00:00:00 2001 From: Shawn Date: Wed, 8 Jan 2025 16:13:46 +0800 Subject: [PATCH 049/119] feat: make fixed-sized queue size, configurable. --- zilliqa/src/sync.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index cb428a38f..d2b445e71 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -48,7 +48,6 @@ use crate::{ // 4. If it does, we inject the entire queue into the pipeline. // 5. We are caught up. -const GAP_THRESHOLD: usize = 20; // Size of internal Proposal cache. const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks. #[derive(Debug)] @@ -97,6 +96,8 @@ impl Sync { }) .collect(); let peer_id = message_sender.our_peer_id; + let max_batch_size = config.block_request_batch_size.clamp(30, 180); // 30-180 sec of blocks at a time. 
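// Illustrative note, not from the patch: assuming roughly one block per second,
// as the in-line comments suggest, the clamp above bounds a batch to 30..=180
// blocks (a configured value of 10 becomes 30, and 1_000 becomes 180). The
// `max_blocks_in_flight` clamp on the next line is then forced into
// `max_batch_size..=1800`, so the pipeline holds at most about 30 minutes of
// blocks and never less than one full batch.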
+ let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. Ok(Self { db, @@ -104,14 +105,14 @@ impl Sync { peers, peer_id, request_timeout: config.consensus.consensus_timeout, - max_batch_size: config.block_request_batch_size.max(31), // between 30 seconds and 3 days of blocks. - max_blocks_in_flight: config.max_blocks_in_flight.min(3600), // cap to 1-hr worth of blocks + max_batch_size, + max_blocks_in_flight, in_flight: None, in_pipeline: usize::MIN, chain_metadata: BTreeMap::new(), chain_segments: Vec::new(), state: SyncState::Phase0, - recent_proposals: VecDeque::with_capacity(GAP_THRESHOLD), + recent_proposals: VecDeque::with_capacity(max_batch_size), }) } @@ -125,7 +126,7 @@ impl Sync { /// We do not perform checks on the Proposal here. This is done in the consensus layer. pub fn sync_proposal(&mut self, proposal: Proposal) -> Result<()> { // just stuff the latest proposal into the fixed-size queue. - while self.recent_proposals.len() >= GAP_THRESHOLD { + while self.recent_proposals.len() >= self.max_batch_size { self.recent_proposals.pop_front(); } self.recent_proposals.push_back(proposal); From 179cb4d48b11c6c211be1e5b63b2d11dc5023c91 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 9 Jan 2025 15:36:43 +0800 Subject: [PATCH 050/119] feat: v1 sync compatibility. --- zilliqa/src/node.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 2e712ad52..16c961681 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -291,6 +291,15 @@ impl Node { ExternalMessage::InjectedProposal(p) => { self.handle_injected_proposal(from, p)?; } + // Respond negatively to old block requests + ExternalMessage::BlockRequest(req) => { + let message = ExternalMessage::BlockResponse(BlockResponse { + availability: None, + proposals: vec![], + from_view: req.from_view, + }); + self.request_responses.send((response_channel, message))?; + } // Handle requests which contain a block proposal. Initially sent as a broadcast, it is re-routed into // a Request by the underlying layer, with a faux request-id. This is to mitigate issues when there are // too many transactions in the broadcast queue. From c93028364ad34e737f871e81ad5e644e3024887e Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 9 Jan 2025 17:58:29 +0800 Subject: [PATCH 051/119] feat: use ChainMetaData as the main state variable structure. --- zilliqa/src/message.rs | 4 +- zilliqa/src/node.rs | 3 + zilliqa/src/sync.rs | 132 ++++++++++++++++++++++++++++++----------- 3 files changed, 101 insertions(+), 38 deletions(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 4fbcbc6d5..2826190d5 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -245,10 +245,10 @@ pub struct InjectedProposal { /// Used to hold metadata about the chain #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ChainMetaData { - // An encoded PeerId - pub block_hash: Hash, pub parent_hash: Hash, + pub block_hash: Hash, pub block_number: u64, + pub view_number: u64, } /// Used to convey proposal processing internally, to avoid blocking threads for too long. diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 16c961681..558cab097 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -340,6 +340,9 @@ impl Node { .consensus .sync .handle_metadata_response(from, response)?, + ExternalMessage::BlockResponse(response) => { + self.consensus.sync.handle_block_response(from, response)? 
+ } ExternalMessage::Acknowledgement => {} msg => { warn!(%msg, "unexpected message type"); diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index d2b445e71..7d27c237c 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -13,7 +13,10 @@ use crate::{ cfg::NodeConfig, crypto::Hash, db::Db, - message::{Block, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, RequestBlock}, + message::{ + Block, BlockResponse, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, + RequestBlock, + }, node::MessageSender, time::SystemTime, }; @@ -75,7 +78,7 @@ pub struct Sync { // complete chain metadata, in-memory chain_metadata: BTreeMap, // markers to segments in the chain, and the source peer for that segment. - chain_segments: Vec<(PeerId, Hash, u64)>, + chain_segments: Vec<(PeerId, ChainMetaData)>, // fixed-size queue of the most recent proposals recent_proposals: VecDeque, } @@ -90,6 +93,7 @@ impl Sync { let peers = peers .into_iter() .map(|peer_id| PeerInfo { + version: PeerVer::V2, // default to V2 peer score: 0, peer_id, last_used: Instant::now(), @@ -138,12 +142,20 @@ impl Sync { if self.db.get_block_by_hash(&parent_hash)?.is_none() { // No parent block, trigger sync tracing::warn!("sync::SyncProposal : syncing from {parent_hash}",); + let block_hash = self.recent_proposals.back().unwrap().hash(); let block_number = self.recent_proposals.back().unwrap().number(); - self.request_missing_metadata(Some((parent_hash, block_number)))?; + let view_number = self.recent_proposals.back().unwrap().view(); + let meta = ChainMetaData { + block_hash, + parent_hash, + block_number, + view_number, + }; + self.request_missing_metadata(Some(meta))?; } } // Continue phase 1, until we hit history/genesis. - SyncState::Phase1(_, _) if self.in_pipeline < self.max_batch_size => { + SyncState::Phase1(_) if self.in_pipeline < self.max_batch_size => { self.request_missing_metadata(None)?; } // Continue phase 2, until we have all segments. @@ -193,9 +205,10 @@ impl Sync { /// Convenience function to extract metadata from the block. fn block_to_metadata(&self, block: Block) -> ChainMetaData { ChainMetaData { - block_number: block.number(), - block_hash: block.hash(), parent_hash: block.parent_hash(), + block_hash: block.hash(), + block_number: block.number(), + view_number: block.view(), } } @@ -210,15 +223,18 @@ impl Sync { } // remove the last segment from the chain metadata - let (peer, hash, num) = self.chain_segments.pop().unwrap(); - let mut key = hash; + let (peer, meta) = self.chain_segments.pop().unwrap(); + let mut key = meta.parent_hash; while let Some(p) = self.chain_metadata.remove(&key) { key = p.parent_hash; } // allow retry from p1 - self.state = SyncState::Phase1(hash, num); - tracing::info!("sync::RetryPhase1 : retrying block {hash} from {peer}"); + tracing::info!( + "sync::RetryPhase1 : retrying block {} from {peer}", + meta.parent_hash + ); + self.state = SyncState::Phase1(meta); if DO_SPECULATIVE { self.request_missing_metadata(None)?; } @@ -260,7 +276,7 @@ impl Sync { ); // Spurious response - let Some((peer_id, hash, _)) = self.chain_segments.last() else { + let Some((peer_id, meta)) = self.chain_segments.last() else { anyhow::bail!("sync::MultiBlockResponse: no more chain_segments!"); }; @@ -272,8 +288,11 @@ impl Sync { // Segment history does not match, retry phase 1. 
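// Illustrative note, not from the patch: the segment marker pushed in phase 1
// is the cursor that was current when this segment's metadata was requested,
// so its `parent_hash` names the newest block the response should start with.
// If the first proposal returned is not that exact block, the stored metadata
// for the segment cannot be trusted (the peer may have pruned or reorganised),
// and the whole segment is redone from phase 1.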
let prop_hash = response.first().as_ref().unwrap().hash(); - if *hash != prop_hash { - tracing::error!("sync::MultiBlockResponse : mismatched landmark {hash} != {prop_hash}"); + if meta.parent_hash != prop_hash { + tracing::error!( + "sync::MultiBlockResponse : mismatched landmark {} != {prop_hash}", + meta.parent_hash + ); return self.retry_phase1(); } @@ -380,9 +399,9 @@ impl Sync { // will be re-inserted below if let Some(peer) = self.get_next_peer() { // If we have no chain_segments, we have nothing to do - if let Some((peer_id, hash, _)) = self.chain_segments.last() { + if let Some((peer_id, meta)) = self.chain_segments.last() { let mut request_hashes = Vec::with_capacity(self.max_batch_size); - let mut key = *hash; // start from this block + let mut key = meta.parent_hash; // start from this block while let Some(meta) = self.chain_metadata.remove(&key) { request_hashes.push(meta.block_hash); key = meta.parent_hash; @@ -411,6 +430,7 @@ impl Sync { )?; self.peers.push(peer); // reinsert peer, as we will be using a faux peer below self.in_flight = Some(PeerInfo { + version: PeerVer::V2, peer_id: *peer_id, last_used: std::time::Instant::now(), score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers @@ -427,6 +447,26 @@ impl Sync { Ok(()) } + /// Handle a V1 block response + /// + /// This will be called during both Phase 1 & Phase 2 block responses. + /// In phase 1, it will extract the metadata and feed it into handle_metadata_response. + /// In phase 2, it will extract the blocks and feed it into handle_multiblock_response. + pub fn handle_block_response(&mut self, from: PeerId, response: BlockResponse) -> Result<()> { + // ... + match self.state { + // Phase 1 + // Phase 2 + _ => { + tracing::debug!( + "sync::HandleBlockResponse : from={from} response={:?}", + response + ); + } + } + Ok(()) + } + /// Handle a response to a metadata request. /// /// This is the first step in the syncing algorithm, where we receive a set of metadata and use it to @@ -451,12 +491,12 @@ impl Sync { } // Check the linkage of the returned chain - let SyncState::Phase1(p1_hash, p1_num) = self.state else { + let SyncState::Phase1(meta) = &self.state else { anyhow::bail!("sync::MetadataResponse : invalid state"); }; - let mut block_hash = p1_hash; - let mut block_num = p1_num; + let mut block_hash = meta.parent_hash; + let mut block_num = meta.block_number; for meta in response.iter() { // check that the block hash and number is as expected. 
if meta.block_hash != Hash::ZERO @@ -482,13 +522,11 @@ impl Sync { let segment = response; // Record landmark, including peer that has this set of blocks - self.chain_segments.push((from, p1_hash, p1_num)); + self.chain_segments.push((from, meta.clone())); // Record the oldest block in the chain's parent - self.state = SyncState::Phase1( - segment.last().unwrap().parent_hash, - segment.last().unwrap().block_number, - ); + self.state = SyncState::Phase1(segment.last().cloned().unwrap()); + let last_block_hash = segment.last().as_ref().unwrap().block_hash; tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", @@ -498,7 +536,6 @@ impl Sync { ); // Record the actual chain metadata - let last_block_hash = segment.last().as_ref().unwrap().block_hash; for meta in segment { if self.chain_metadata.insert(meta.block_hash, meta).is_some() { anyhow::bail!("loop in chain!"); // there is a possible loop in the chain @@ -565,7 +602,7 @@ impl Sync { /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. /// Otherwise, it requests blocks from the given omega_block. - pub fn request_missing_metadata(&mut self, block: Option<(Hash, u64)>) -> Result<()> { + pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { if matches!(self.state, SyncState::Phase2(_)) || matches!(self.state, SyncState::Phase3) { anyhow::bail!("sync::RequestMissingMetadata : invalid state"); } @@ -591,17 +628,20 @@ impl Sync { if let Some(peer) = self.get_next_peer() { let message = match self.state { - SyncState::Phase1(hash, _) => ExternalMessage::MetaDataRequest(RequestBlock { - request_at: SystemTime::now(), - from_hash: hash, - batch_size: self.max_batch_size, - }), - SyncState::Phase0 if block.is_some() => { - let (hash, number) = block.unwrap(); - self.state = SyncState::Phase1(hash, number); + SyncState::Phase1(ChainMetaData { parent_hash, .. }) => { ExternalMessage::MetaDataRequest(RequestBlock { request_at: SystemTime::now(), - from_hash: hash, + from_hash: parent_hash, + batch_size: self.max_batch_size, + }) + } + SyncState::Phase0 if meta.is_some() => { + let meta = meta.unwrap(); + self.state = SyncState::Phase1(meta.clone()); + let ChainMetaData { parent_hash, .. } = meta; + ExternalMessage::MetaDataRequest(RequestBlock { + request_at: SystemTime::now(), + from_hash: parent_hash, batch_size: self.max_batch_size, }) } @@ -688,6 +728,16 @@ impl Sync { /// Downgrade a peer based on the response received. fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { + // TODO: Double-check version logic + peer.version = match downgrade { + // a V1 will not respond with anything to a V2 request. + DownGrade::Timeout if matches!(peer.version, PeerVer::V2) => PeerVer::V1, + // a V2 will respond with availability = None to a V1 request. + DownGrade::Unavailable if matches!(peer.version, PeerVer::V1) => PeerVer::V2, + // Otherwise, maintain + _ => peer.version, + }; + // Downgrade peer, if necessary peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better, to avoid a single source of truth. @@ -703,6 +753,7 @@ impl Sync { pub fn add_peer(&mut self, peer: PeerId) { // new peers should be tried last, which gives them time to sync first. 
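// Illustrative note, not from the patch: a lower score marks a more reliable
// peer, and `done_with_peer` adds the `DownGrade` value (with `Empty` costing
// the most, given the enum's declaration order) to whichever peer just
// answered. Seeding a newly added peer with the current maximum score, as done
// below, therefore places it behind every existing peer in the selection order
// without penalising it permanently.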
let new_peer = PeerInfo { + version: PeerVer::V2, // default V2 score: self.peers.iter().map(|p| p.score).max().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), @@ -727,11 +778,12 @@ impl Sync { } } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Eq, PartialEq)] struct PeerInfo { score: u32, peer_id: PeerId, last_used: Instant, + version: PeerVer, } impl Ord for PeerInfo { @@ -753,6 +805,7 @@ impl PartialOrd for PeerInfo { #[derive(Debug)] enum DownGrade { None, + Unavailable, Partial, Timeout, Empty, @@ -762,7 +815,14 @@ enum DownGrade { #[derive(Debug)] enum SyncState { Phase0, - Phase1(Hash, u64), + Phase1(ChainMetaData), Phase2(Hash), Phase3, } + +/// Peer Version +#[derive(Debug, Clone, Eq, PartialEq)] +enum PeerVer { + V1, + V2, +} From b643541f6419ed9a1de5091562162965006d6f6c Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 9 Jan 2025 19:08:43 +0800 Subject: [PATCH 052/119] feat: make sync compatible with older nodes. --- zilliqa/src/node.rs | 4 +- zilliqa/src/sync.rs | 114 +++++++++++++++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 28 deletions(-) diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 558cab097..410d29c21 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -275,6 +275,7 @@ impl Node { self.request_responses .send((response_channel, ExternalMessage::Acknowledgement))?; } + // RFC-161 sync algorithm, phase 2. ExternalMessage::MultiBlockRequest(request) => { let message = self .consensus @@ -282,6 +283,7 @@ impl Node { .handle_multiblock_request(from, request)?; self.request_responses.send((response_channel, message))?; } + // RFC-161 sync algorithm, phase 1. ExternalMessage::MetaDataRequest(request) => { let message = self.consensus.sync.handle_metadata_request(from, request)?; self.request_responses.send((response_channel, message))?; @@ -291,7 +293,7 @@ impl Node { ExternalMessage::InjectedProposal(p) => { self.handle_injected_proposal(from, p)?; } - // Respond negatively to old block requests + // Respond negatively to block request from old nodes ExternalMessage::BlockRequest(req) => { let message = ExternalMessage::BlockResponse(BlockResponse { availability: None, diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 7d27c237c..56333424d 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -14,8 +14,8 @@ use crate::{ crypto::Hash, db::Db, message::{ - Block, BlockResponse, ChainMetaData, ExternalMessage, InjectedProposal, Proposal, - RequestBlock, + Block, BlockRequest, BlockResponse, ChainMetaData, ExternalMessage, InjectedProposal, + Proposal, RequestBlock, }, node::MessageSender, time::SystemTime, @@ -78,7 +78,7 @@ pub struct Sync { // complete chain metadata, in-memory chain_metadata: BTreeMap, // markers to segments in the chain, and the source peer for that segment. 
- chain_segments: Vec<(PeerId, ChainMetaData)>, + chain_segments: Vec<(PeerInfo, ChainMetaData)>, // fixed-size queue of the most recent proposals recent_proposals: VecDeque, } @@ -223,7 +223,7 @@ impl Sync { } // remove the last segment from the chain metadata - let (peer, meta) = self.chain_segments.pop().unwrap(); + let (peer_info, meta) = self.chain_segments.pop().unwrap(); let mut key = meta.parent_hash; while let Some(p) = self.chain_metadata.remove(&key) { key = p.parent_hash; @@ -231,8 +231,9 @@ impl Sync { // allow retry from p1 tracing::info!( - "sync::RetryPhase1 : retrying block {} from {peer}", - meta.parent_hash + "sync::RetryPhase1 : retrying block {} from {}", + meta.parent_hash, + peer_info.peer_id, ); self.state = SyncState::Phase1(meta); if DO_SPECULATIVE { @@ -276,12 +277,12 @@ impl Sync { ); // Spurious response - let Some((peer_id, meta)) = self.chain_segments.last() else { + let Some((peer_info, meta)) = self.chain_segments.last() else { anyhow::bail!("sync::MultiBlockResponse: no more chain_segments!"); }; // If the response is not from the expected peer, retry phase 2. - if *peer_id != from { + if peer_info.peer_id != from { tracing::warn!("sync::MultiBlockResponse: unknown peer {from}, will retry"); return Ok(()); } @@ -399,12 +400,15 @@ impl Sync { // will be re-inserted below if let Some(peer) = self.get_next_peer() { // If we have no chain_segments, we have nothing to do - if let Some((peer_id, meta)) = self.chain_segments.last() { + if let Some((peer_info, meta)) = self.chain_segments.last() { + let to_view = meta.view_number.saturating_sub(1); + let mut from_view = meta.view_number; let mut request_hashes = Vec::with_capacity(self.max_batch_size); let mut key = meta.parent_hash; // start from this block while let Some(meta) = self.chain_metadata.remove(&key) { request_hashes.push(meta.block_hash); key = meta.parent_hash; + from_view = meta.view_number; self.chain_metadata.insert(meta.block_hash, meta); // reinsert, for retries } @@ -422,19 +426,33 @@ impl Sync { "sync::RequestMissingBlocks : requesting {} blocks of segment #{} from {}", request_hashes.len(), self.chain_segments.len(), - peer_id, + peer_info.peer_id, ); - self.message_sender.send_external_message( - *peer_id, - ExternalMessage::MultiBlockRequest(request_hashes), - )?; + self.peers.push(peer); // reinsert peer, as we will be using a faux peer below - self.in_flight = Some(PeerInfo { - version: PeerVer::V2, - peer_id: *peer_id, - last_used: std::time::Instant::now(), - score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers - }); + + let message = match peer_info.version { + PeerVer::V2 => { + self.in_flight = Some(PeerInfo { + version: PeerVer::V2, + peer_id: peer_info.peer_id, + last_used: std::time::Instant::now(), + score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers + }); + ExternalMessage::MultiBlockRequest(request_hashes) + } + PeerVer::V1 => { + self.in_flight = Some(PeerInfo { + version: PeerVer::V1, + peer_id: peer_info.peer_id, + last_used: std::time::Instant::now(), + score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers + }); + ExternalMessage::BlockRequest(BlockRequest { to_view, from_view }) + } + }; + self.message_sender + .send_external_message(peer_info.peer_id, message)?; } else { // No more chain_segments, we're done self.peers.push(peer); @@ -456,9 +474,32 @@ impl Sync { // ... 
match self.state { // Phase 1 + SyncState::Phase1(_) => { + // TODO: Should be buffer the proposals? Probably not! + let metadata = response + .proposals + .into_iter() + .sorted_by(|a, b| b.view().cmp(&a.view())) + .map(|p| ChainMetaData { + block_hash: p.hash(), + parent_hash: p.header.qc.block_hash, + block_number: p.number(), + view_number: p.view(), + }) + .collect_vec(); + self.handle_metadata_response(from, metadata)?; + } // Phase 2 + SyncState::Phase2(_) => { + let multi_blocks = response + .proposals + .into_iter() + .sorted_by(|a, b| b.view().cmp(&a.view())) + .collect_vec(); + self.handle_multiblock_response(from, multi_blocks)?; + } _ => { - tracing::debug!( + tracing::error!( "sync::HandleBlockResponse : from={from} response={:?}", response ); @@ -476,6 +517,7 @@ impl Sync { from: PeerId, response: Vec, ) -> Result<()> { + let segment_peer = self.in_flight.as_ref().unwrap().clone(); // Process whatever we have received. if response.is_empty() { // Empty response, downgrade peer and retry with a new peer. @@ -522,7 +564,7 @@ impl Sync { let segment = response; // Record landmark, including peer that has this set of blocks - self.chain_segments.push((from, meta.clone())); + self.chain_segments.push((segment_peer, meta.clone())); // Record the oldest block in the chain's parent self.state = SyncState::Phase1(segment.last().cloned().unwrap()); @@ -628,26 +670,44 @@ impl Sync { if let Some(peer) = self.get_next_peer() { let message = match self.state { - SyncState::Phase1(ChainMetaData { parent_hash, .. }) => { + SyncState::Phase1(ChainMetaData { parent_hash, .. }) + if matches!(peer.version, PeerVer::V2) => + { ExternalMessage::MetaDataRequest(RequestBlock { request_at: SystemTime::now(), from_hash: parent_hash, batch_size: self.max_batch_size, }) } - SyncState::Phase0 if meta.is_some() => { + SyncState::Phase1(ChainMetaData { view_number, .. }) + if matches!(peer.version, PeerVer::V1) => + { + ExternalMessage::BlockRequest(BlockRequest { + to_view: view_number.saturating_sub(1), // we want the parent i.e. earlier view + from_view: view_number.saturating_sub(self.max_batch_size as u64), + }) + } + SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V2) => { let meta = meta.unwrap(); - self.state = SyncState::Phase1(meta.clone()); - let ChainMetaData { parent_hash, .. } = meta; + let parent_hash = meta.parent_hash; + self.state = SyncState::Phase1(meta); ExternalMessage::MetaDataRequest(RequestBlock { request_at: SystemTime::now(), from_hash: parent_hash, batch_size: self.max_batch_size, }) } + SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V1) => { + let meta = meta.unwrap(); + let view_number = meta.view_number; + self.state = SyncState::Phase1(meta); + ExternalMessage::BlockRequest(BlockRequest { + to_view: view_number.saturating_sub(1), // we want the parent i.e. earlier view + from_view: view_number.saturating_sub(self.max_batch_size as u64), + }) + } _ => anyhow::bail!("sync::MissingMetadata : invalid state"), }; - tracing::info!( ?message, "sync::RequestMissingMetadata : requesting missing chain from {}", From feecd20a3ad56d2f0363f1691246bade88a86324 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 9 Jan 2025 20:21:19 +0800 Subject: [PATCH 053/119] feat: default to V1 peer; upgrade to V2 peer upon getting invalid response. 
--- zilliqa/src/message.rs | 4 +- zilliqa/src/node.rs | 5 +- zilliqa/src/sync.rs | 104 ++++++++++++++++++++++------------------- 3 files changed, 61 insertions(+), 52 deletions(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 2826190d5..c9a758a31 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -228,7 +228,7 @@ impl fmt::Debug for BlockResponse { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RequestBlock { +pub struct BlockRequestV2 { pub request_at: SystemTime, pub from_hash: Hash, pub batch_size: usize, @@ -286,7 +286,7 @@ pub enum ExternalMessage { AddPeer, RemovePeer, InjectedProposal(InjectedProposal), - MetaDataRequest(RequestBlock), + MetaDataRequest(BlockRequestV2), MetaDataResponse(Vec), MultiBlockRequest(Vec), MultiBlockResponse(Vec), diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 410d29c21..589d7065c 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -294,11 +294,12 @@ impl Node { self.handle_injected_proposal(from, p)?; } // Respond negatively to block request from old nodes - ExternalMessage::BlockRequest(req) => { + ExternalMessage::BlockRequest(_) => { + // respond with an invalid response let message = ExternalMessage::BlockResponse(BlockResponse { availability: None, proposals: vec![], - from_view: req.from_view, + from_view: u64::MAX, }); self.request_responses.send((response_channel, message))?; } diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 56333424d..5a688ef72 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -14,8 +14,8 @@ use crate::{ crypto::Hash, db::Db, message::{ - Block, BlockRequest, BlockResponse, ChainMetaData, ExternalMessage, InjectedProposal, - Proposal, RequestBlock, + Block, BlockRequest, BlockRequestV2, BlockResponse, ChainMetaData, ExternalMessage, + InjectedProposal, Proposal, }, node::MessageSender, time::SystemTime, @@ -51,7 +51,10 @@ use crate::{ // 4. If it does, we inject the entire queue into the pipeline. // 5. We are caught up. -const DO_SPECULATIVE: bool = false; // Speeds up syncing by speculatively fetching blocks. +#[cfg(debug_assertions)] +const DO_SPECULATIVE: bool = false; +#[cfg(not(debug_assertions))] +const DO_SPECULATIVE: bool = true; // Speeds up syncing by speculatively fetching blocks. #[derive(Debug)] pub struct Sync { @@ -81,6 +84,8 @@ pub struct Sync { chain_segments: Vec<(PeerInfo, ChainMetaData)>, // fixed-size queue of the most recent proposals recent_proposals: VecDeque, + // for statistics only + inject_at: Option<(std::time::Instant, usize)>, } impl Sync { @@ -93,7 +98,7 @@ impl Sync { let peers = peers .into_iter() .map(|peer_id| PeerInfo { - version: PeerVer::V2, // default to V2 peer + version: PeerVer::V1, // default to V1 peer, until otherwise proven. score: 0, peer_id, last_used: Instant::now(), @@ -117,6 +122,7 @@ impl Sync { chain_segments: Vec::new(), state: SyncState::Phase0, recent_proposals: VecDeque::with_capacity(max_batch_size), + inject_at: None, }) } @@ -215,10 +221,11 @@ impl Sync { /// Retry phase 1 /// /// If something went wrong, phase 1 may need to be retried for the most recent segment. - /// Pop the segment from the landmark, and continue phase 1. + /// Pop the segment from the segment marker, and continue phase 1. 
fn retry_phase1(&mut self) -> Result<()> { if self.chain_segments.is_empty() { tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain_segments!"); + self.state = SyncState::Phase0; return Ok(()); } @@ -277,26 +284,16 @@ impl Sync { ); // Spurious response - let Some((peer_info, meta)) = self.chain_segments.last() else { + let Some((peer_info, _)) = self.chain_segments.last() else { anyhow::bail!("sync::MultiBlockResponse: no more chain_segments!"); }; - // If the response is not from the expected peer, retry phase 2. + // If the response is not from the expected peer e.g. delayed response, retry phase 2. if peer_info.peer_id != from { tracing::warn!("sync::MultiBlockResponse: unknown peer {from}, will retry"); return Ok(()); } - // Segment history does not match, retry phase 1. - let prop_hash = response.first().as_ref().unwrap().hash(); - if meta.parent_hash != prop_hash { - tracing::error!( - "sync::MultiBlockResponse : mismatched landmark {} != {prop_hash}", - meta.parent_hash - ); - return self.retry_phase1(); - } - // If the checksum does not match, retry phase 1. Maybe the node has pruned the segment. let checksum = response .iter() @@ -373,7 +370,7 @@ impl Sync { /// /// It constructs a set of hashes, which constitute the series of blocks that are missing. /// These hashes are then sent to a Peer for retrieval. - /// This is Part 2 of the syncing algorithm. + /// This is phase 2 of the syncing algorithm. fn request_missing_blocks(&mut self) -> Result<()> { if !matches!(self.state, SyncState::Phase2(_)) { anyhow::bail!("sync::RequestMissingBlocks : invalid state"); @@ -399,6 +396,9 @@ impl Sync { // will be re-inserted below if let Some(peer) = self.get_next_peer() { + // reinsert peer, as we will use a faux peer below, to force the request to go to the original responder + self.peers.push(peer); + // If we have no chain_segments, we have nothing to do if let Some((peer_info, meta)) = self.chain_segments.last() { let to_view = meta.view_number.saturating_sub(1); @@ -429,8 +429,6 @@ impl Sync { peer_info.peer_id, ); - self.peers.push(peer); // reinsert peer, as we will be using a faux peer below - let message = match peer_info.version { PeerVer::V2 => { self.in_flight = Some(PeerInfo { @@ -453,9 +451,6 @@ impl Sync { }; self.message_sender .send_external_message(peer_info.peer_id, message)?; - } else { - // No more chain_segments, we're done - self.peers.push(peer); } } else { tracing::warn!( @@ -468,14 +463,26 @@ impl Sync { /// Handle a V1 block response /// /// This will be called during both Phase 1 & Phase 2 block responses. + /// If the response if from a V2 peer, it will upgrade that peer to V2. /// In phase 1, it will extract the metadata and feed it into handle_metadata_response. /// In phase 2, it will extract the blocks and feed it into handle_multiblock_response. pub fn handle_block_response(&mut self, from: PeerId, response: BlockResponse) -> Result<()> { - // ... + // Upgrade to V2 peer. + if response.availability.is_none() + && response.proposals.is_empty() + && response.from_view == u64::MAX + { + tracing::info!("sync::HandleBlockResponse : upgrading {from} to V2",); + self.in_flight.as_mut().unwrap().version = PeerVer::V2; + self.done_with_peer(DownGrade::None); + return Ok(()); + } + + // Convert the V1 response into a V2 response. match self.state { // Phase 1 SyncState::Phase1(_) => { - // TODO: Should be buffer the proposals? Probably not! + // We do not buffer the proposals, as it takes 250MB/day! 
let metadata = response .proposals .into_iter() @@ -491,11 +498,12 @@ impl Sync { } // Phase 2 SyncState::Phase2(_) => { - let multi_blocks = response + let mut multi_blocks = response .proposals .into_iter() .sorted_by(|a, b| b.view().cmp(&a.view())) .collect_vec(); + multi_blocks.retain(|p| self.chain_metadata.contains_key(&p.hash())); self.handle_multiblock_response(from, multi_blocks)?; } _ => { @@ -511,7 +519,8 @@ impl Sync { /// Handle a response to a metadata request. /// /// This is the first step in the syncing algorithm, where we receive a set of metadata and use it to - /// construct a chain history. + /// construct a chain history. We check that the metadata does indeed constitute a chain. If it does, + /// we record its segment marker and store the entire chain in-memory. pub fn handle_metadata_response( &mut self, from: PeerId, @@ -603,7 +612,7 @@ impl Sync { pub fn handle_metadata_request( &mut self, from: PeerId, - request: RequestBlock, + request: BlockRequestV2, ) -> Result { tracing::debug!( "sync::MetadataRequest : received a metadata request from {}", @@ -643,7 +652,7 @@ impl Sync { /// /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. - /// Otherwise, it requests blocks from the given omega_block. + /// Otherwise, it requests blocks from the given starting metadata. pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { if matches!(self.state, SyncState::Phase2(_)) || matches!(self.state, SyncState::Phase3) { anyhow::bail!("sync::RequestMissingMetadata : invalid state"); @@ -660,7 +669,7 @@ impl Sync { return Ok(()); } } else if self.in_pipeline > self.max_batch_size { - // anything more than this and we cannot check whether the segment hits history + // anything more than this and we cannot be sure whether the segment hits history tracing::warn!( "sync::RequestMissingMetadata : syncing {} blocks in pipeline", self.in_pipeline @@ -673,7 +682,7 @@ impl Sync { SyncState::Phase1(ChainMetaData { parent_hash, .. }) if matches!(peer.version, PeerVer::V2) => { - ExternalMessage::MetaDataRequest(RequestBlock { + ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, batch_size: self.max_batch_size, @@ -691,7 +700,7 @@ impl Sync { let meta = meta.unwrap(); let parent_hash = meta.parent_hash; self.state = SyncState::Phase1(meta); - ExternalMessage::MetaDataRequest(RequestBlock { + ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, batch_size: self.max_batch_size, @@ -727,14 +736,22 @@ impl Sync { /// Inject the proposals into the chain. /// - /// Besides pumping the set of Proposals into the processing pipeline, it also records the - /// last known Proposal in the pipeline. This is used for speculative fetches, and also for - /// knowing where to continue fetching from. + /// It adds the list of proposals into the pipeline for execution. + /// It also outputs some syncing statistics. 
fn inject_proposals(&mut self, proposals: Vec) -> Result<()> { if proposals.is_empty() { return Ok(()); } + // Output some stats + if let Some((when, injected)) = self.inject_at { + tracing::debug!( + "sync::InjectProposals : synced {}/{:?}", + injected - self.in_pipeline, + when.elapsed() + ); + } + // Increment proposals injected self.in_pipeline = self.in_pipeline.saturating_add(proposals.len()); let len = proposals.len(); @@ -742,7 +759,7 @@ impl Sync { // Just pump the Proposals back to ourselves. for p in proposals { tracing::trace!( - "Injecting proposal number: {} hash: {}", + "sync::InjectProposals : injecting number: {} hash: {}", p.number(), p.hash(), ); @@ -756,6 +773,8 @@ impl Sync { )?; } + self.inject_at = Some((std::time::Instant::now(), self.in_pipeline)); + tracing::debug!( "sync::InjectProposals : injected {}/{} proposals", len, @@ -788,16 +807,6 @@ impl Sync { /// Downgrade a peer based on the response received. fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { - // TODO: Double-check version logic - peer.version = match downgrade { - // a V1 will not respond with anything to a V2 request. - DownGrade::Timeout if matches!(peer.version, PeerVer::V2) => PeerVer::V1, - // a V2 will respond with availability = None to a V1 request. - DownGrade::Unavailable if matches!(peer.version, PeerVer::V1) => PeerVer::V2, - // Otherwise, maintain - _ => peer.version, - }; - // Downgrade peer, if necessary peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better, to avoid a single source of truth. @@ -813,7 +822,7 @@ impl Sync { pub fn add_peer(&mut self, peer: PeerId) { // new peers should be tried last, which gives them time to sync first. let new_peer = PeerInfo { - version: PeerVer::V2, // default V2 + version: PeerVer::V1, // default V2 score: self.peers.iter().map(|p| p.score).max().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), @@ -865,7 +874,6 @@ impl PartialOrd for PeerInfo { #[derive(Debug)] enum DownGrade { None, - Unavailable, Partial, Timeout, Empty, From ad4cc3bae3734163cec2c62efdf023c7c0f7614a Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 10 Jan 2025 10:05:35 +0800 Subject: [PATCH 054/119] feat: filter V1 responses for gaps and forks. --- zilliqa/src/sync.rs | 53 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 5a688ef72..e64794b12 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -401,7 +401,7 @@ impl Sync { // If we have no chain_segments, we have nothing to do if let Some((peer_info, meta)) = self.chain_segments.last() { - let to_view = meta.view_number.saturating_sub(1); + let to_view = meta.view_number.saturating_add(Self::VIEW_DRIFT); let mut from_view = meta.view_number; let mut request_hashes = Vec::with_capacity(self.max_batch_size); let mut key = meta.parent_hash; // start from this block @@ -460,6 +460,11 @@ impl Sync { Ok(()) } + // we request a little more than we need, due to drift + // 10 ~ 1min + // 20 ~ 1hr + const VIEW_DRIFT: u64 = 10; + /// Handle a V1 block response /// /// This will be called during both Phase 1 & Phase 2 block responses. @@ -478,15 +483,33 @@ impl Sync { return Ok(()); } + if response.proposals.is_empty() { + tracing::info!("sync::HandleBlockResponse : empty V1 from {from}"); + self.done_with_peer(DownGrade::Empty); + return Ok(()); + } + // Convert the V1 response into a V2 response. 
match self.state { - // Phase 1 - SyncState::Phase1(_) => { + // Phase 1 - extract metadata from the set of proposals + SyncState::Phase1(ChainMetaData { + block_number, + mut parent_hash, + .. + }) => { // We do not buffer the proposals, as it takes 250MB/day! let metadata = response .proposals .into_iter() - .sorted_by(|a, b| b.view().cmp(&a.view())) + .filter(|p| p.number() < block_number) // filter extras + .sorted_by(|a, b| b.number().cmp(&a.number())) + .filter(|p| { + if parent_hash != p.hash() { + return false; + } + parent_hash = p.header.qc.block_hash; + true + }) // filter forks .map(|p| ChainMetaData { block_hash: p.hash(), parent_hash: p.header.qc.block_hash, @@ -494,16 +517,17 @@ impl Sync { view_number: p.view(), }) .collect_vec(); + self.handle_metadata_response(from, metadata)?; } - // Phase 2 + // Phase 2 - extract the requested blocks only SyncState::Phase2(_) => { - let mut multi_blocks = response + let multi_blocks = response .proposals .into_iter() - .sorted_by(|a, b| b.view().cmp(&a.view())) + .filter(|p| self.chain_metadata.contains_key(&p.hash())) // filter extras + .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); - multi_blocks.retain(|p| self.chain_metadata.contains_key(&p.hash())); self.handle_multiblock_response(from, multi_blocks)?; } _ => { @@ -560,7 +584,9 @@ impl Sync { // TODO: possibly, discard and rebuild entire chain // if something does not match, do nothing and retry the request with the next peer. tracing::error!( - "sync::MetadataResponse : retry metadata history for {block_hash}/{block_num}" + "sync::MetadataResponse : retry metadata expected hash={block_hash} != {} num={block_num} != {}", + meta.block_hash, + meta.block_number, ); return Ok(()); } @@ -692,7 +718,7 @@ impl Sync { if matches!(peer.version, PeerVer::V1) => { ExternalMessage::BlockRequest(BlockRequest { - to_view: view_number.saturating_sub(1), // we want the parent i.e. earlier view + to_view: view_number.saturating_add(Self::VIEW_DRIFT), from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } @@ -711,7 +737,7 @@ impl Sync { let view_number = meta.view_number; self.state = SyncState::Phase1(meta); ExternalMessage::BlockRequest(BlockRequest { - to_view: view_number.saturating_sub(1), // we want the parent i.e. earlier view + to_view: view_number.saturating_add(Self::VIEW_DRIFT), from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } @@ -870,13 +896,14 @@ impl PartialOrd for PeerInfo { } } -/// Peer downgrade states/values, for downgrading an internal peer from selection. +/// For downgrading a peer from being selected in get_next_peer(). +/// Ordered by degree of offence i.e. None is good, Timeout is worst #[derive(Debug)] enum DownGrade { None, Partial, - Timeout, Empty, + Timeout, } /// Sync state From ee3e322682ad67240e45089d919d0fc942a4f27d Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 10 Jan 2025 15:18:52 +0800 Subject: [PATCH 055/119] feat: working phase 1 with protomainnet. 
--- zilliqa/src/sync.rs | 91 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 9 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index e64794b12..c0af4d5b3 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -3,8 +3,10 @@ use std::{ collections::{BTreeMap, BinaryHeap, VecDeque}, sync::Arc, time::{Duration, Instant}, + u64, }; +use alloy::primitives::BlockNumber; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; @@ -86,6 +88,8 @@ pub struct Sync { recent_proposals: VecDeque, // for statistics only inject_at: Option<(std::time::Instant, usize)>, + // record starting number, for eth_syncing() RPC call. + started_at_block_number: u64, } impl Sync { @@ -123,6 +127,7 @@ impl Sync { state: SyncState::Phase0, recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, + started_at_block_number: u64::MIN, }) } @@ -158,6 +163,16 @@ impl Sync { view_number, }; self.request_missing_metadata(Some(meta))?; + + let highest_block = self + .db + .get_canonical_block_by_number( + self.db + .get_highest_canonical_block_number()? + .expect("no highest block"), + )? + .expect("missing highest block"); + self.started_at_block_number = highest_block.number(); } } // Continue phase 1, until we hit history/genesis. @@ -258,6 +273,19 @@ impl Sync { from: PeerId, response: Vec, ) -> Result<()> { + if let Some(peer) = self.in_flight.as_ref() { + if peer.peer_id != from { + tracing::warn!( + "sync::MultiBlockResponse : unexpected peer={} != {from}", + peer.peer_id + ); + return Ok(()); + } + } else { + tracing::warn!("sync::MultiBlockResponse : spurious response {from}"); + return Ok(()); + } + // Process only a full response if response.is_empty() { // Empty response, downgrade peer and retry phase 1. @@ -265,9 +293,9 @@ impl Sync { self.done_with_peer(DownGrade::Empty); return self.retry_phase1(); } else if response.len() < self.max_batch_size { - // Partial response, downgrade peer but process the block. + // Partial response, process blocks. tracing::warn!("sync::MultiBlockResponse : partial blocks {from}",); - self.done_with_peer(DownGrade::Partial); + self.done_with_peer(DownGrade::None); } else { self.done_with_peer(DownGrade::None); } @@ -550,6 +578,19 @@ impl Sync { from: PeerId, response: Vec, ) -> Result<()> { + if let Some(peer) = self.in_flight.as_ref() { + if peer.peer_id != from { + tracing::warn!( + "sync::MetadataResponse : unexpected peer={} != {from}", + peer.peer_id + ); + return Ok(()); + } + } else { + tracing::warn!("sync::MetadataResponse : spurious response {from}"); + return Ok(()); + } + let segment_peer = self.in_flight.as_ref().unwrap().clone(); // Process whatever we have received. if response.is_empty() { @@ -558,9 +599,9 @@ impl Sync { self.done_with_peer(DownGrade::Empty); return Ok(()); } else if response.len() < self.max_batch_size { - // Partial response, downgrade peer but process the response. + // Partial response, process the response. tracing::warn!("sync::MetadataResponse : partial blocks {from}",); - self.done_with_peer(DownGrade::Partial); + self.done_with_peer(DownGrade::None); } else { self.done_with_peer(DownGrade::None); } @@ -704,6 +745,7 @@ impl Sync { } if let Some(peer) = self.get_next_peer() { + let peer_id = peer.peer_id; let message = match self.state { SyncState::Phase1(ChainMetaData { parent_hash, .. 
}) if matches!(peer.version, PeerVer::V2) => @@ -746,12 +788,11 @@ impl Sync { tracing::info!( ?message, "sync::RequestMissingMetadata : requesting missing chain from {}", - peer.peer_id + peer_id ); - self.message_sender - .send_external_message(peer.peer_id, message)?; - self.in_flight = Some(peer); + self.message_sender + .send_external_message(peer_id, message)?; } else { tracing::warn!( "sync::RequestMissingMetadata : insufficient peers to request missing blocks" @@ -871,6 +912,39 @@ impl Sync { peer.last_used = std::time::Instant::now(); // used to determine stale in-flight requests. Some(peer) } + + /// Returns (am_syncing, current_highest_block) + pub fn am_syncing(&self) -> Result<(bool, Block)> { + let highest_block = self + .db + .get_canonical_block_by_number( + self.db + .get_highest_canonical_block_number()? + .expect("no highest block"), + )? + .expect("missing highest block"); + Ok(( + self.in_pipeline > 0 || !matches!(self.state, SyncState::Phase0), + highest_block, + )) + } + + // Returns (starting_block, current_block, highest_block) if we're syncing, + // None if we're not. + pub fn get_sync_data(&self) -> Result> { + let (flag, highest_block) = self.am_syncing()?; + if !flag { + Ok(None) + } else { + let highest_saved_block_number = highest_block.number(); + let highest_block_number_seen = self.recent_proposals.back().unwrap().number(); + Ok(Some(( + self.started_at_block_number, + highest_saved_block_number, + highest_block_number_seen, + ))) + } + } } #[derive(Debug, Clone, Eq, PartialEq)] @@ -901,7 +975,6 @@ impl PartialOrd for PeerInfo { #[derive(Debug)] enum DownGrade { None, - Partial, Empty, Timeout, } From 2e779e90d35d8e33d0ac4b102365c459df652215 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 10 Jan 2025 15:19:50 +0800 Subject: [PATCH 056/119] feat: removed sending BlockRequest from block_store.rs --- zilliqa/src/block_store.rs | 209 +------------------------------------ zilliqa/src/sync.rs | 5 +- 2 files changed, 5 insertions(+), 209 deletions(-) diff --git a/zilliqa/src/block_store.rs b/zilliqa/src/block_store.rs index 32bf71a71..e17b60e96 100644 --- a/zilliqa/src/block_store.rs +++ b/zilliqa/src/block_store.rs @@ -388,8 +388,6 @@ pub struct BlockStore { peers: BTreeMap, /// The maximum number of blocks to send requests for at a time. max_blocks_in_flight: u64, - /// When a request to a peer fails, do not send another request to this peer for this amount of time. - failed_request_sleep_duration: Duration, /// Our block strategies. strategies: Vec, /// The block views we have available. This is read once from the DB at start-up and incrementally updated whenever @@ -402,9 +400,6 @@ pub struct BlockStore { unserviceable_requests: Option, message_sender: MessageSender, - /// Clock pointer - see request_blocks() - clock: usize, - /// Where we last started syncing, so we can report it in get_sync_data() started_syncing_at: BlockNumber, /// Previous sync flag, so we can tell when it changes. @@ -426,8 +421,6 @@ struct PeerInfo { availability: BlockAvailability, /// When did we last update availability? availability_updated_at: Option, - /// Last availability query - don't send them too often. - availability_requested_at: Option, /// Requests we've sent to the peer. pending_requests: HashMap, /// If `Some`, the time of the most recently failed request. 
@@ -439,50 +432,10 @@ impl PeerInfo { Self { availability: BlockAvailability::new(), availability_updated_at: None, - availability_requested_at: None, pending_requests: HashMap::new(), last_request_failed_at: None, } } - - /// Do we have availability, or should we get it again? - fn have_availability(&self) -> bool { - self.availability_updated_at.is_some() - } - - /// Converts a set of block strategies into a rangemap - fn get_ranges(&self, max_view: Option) -> RangeMap { - let mut result = RangeMap::new(); - if let Some(strat) = &self.availability.strategies { - let mut max_end: Option = None; - let mut last_n: Option = None; - for s in strat { - match s { - BlockStrategy::CachedViewRange(views, until_view) => { - if until_view.map_or(true, |x| self.availability.highest_known_view <= x) { - result.with_range(views); - max_end = Some( - max_end.map_or(views.end - 1, |v| std::cmp::max(v, views.end - 1)), - ); - } - } - BlockStrategy::Latest(n) => { - last_n = Some(last_n.map_or(*n, |x| std::cmp::max(x, *n))); - } - } - } - if let Some(the_n) = last_n { - if let Some(max_view_nr) = max_view { - let start = max_view_nr.saturating_sub(the_n); - result.with_range(&Range { - start, - end: max_view_nr, - }); - } - } - } - result - } } /// Data about a peer @@ -568,13 +521,11 @@ impl BlockStore { highest_confirmed_view: 0, peers: BTreeMap::new(), max_blocks_in_flight: config.max_blocks_in_flight as u64, - failed_request_sleep_duration: config.failed_request_sleep_duration, strategies: vec![BlockStrategy::Latest(constants::RETAINS_LAST_N_BLOCKS)], available_blocks, buffered: BlockCache::new(config.max_blocks_in_flight as u64), unserviceable_requests: None, message_sender, - clock: 0, started_syncing_at: 0, last_sync_flag: false, }) @@ -604,13 +555,11 @@ impl BlockStore { highest_confirmed_view: 0, peers: BTreeMap::new(), max_blocks_in_flight: 0, - failed_request_sleep_duration: Duration::ZERO, strategies: self.strategies.clone(), available_blocks: RangeMap::new(), buffered: BlockCache::new(0), unserviceable_requests: None, message_sender: self.message_sender.clone(), - clock: 0, started_syncing_at: self.started_syncing_at, last_sync_flag: self.last_sync_flag, }) @@ -809,162 +758,8 @@ impl BlockStore { /// Make a request for the blocks associated with a range of views. Returns `true` if a request was made and `false` if the request had to be /// buffered because no peers were available. /// Public so we can trigger it from the debug API - pub fn request_blocks(&mut self, req: &RangeMap) -> Result { - let mut remain = req.clone(); - let to = req.max(); - - // Prune the pending requests - self.prune_pending_requests()?; - - // If it's in our input queue, don't expect it again. - let expected = self.buffered.expectant_block_ranges(); - (_, remain) = remain.diff_inter(&expected); - - // If it's already buffered, don't request it again - wait for us to reject it and - // then we can re-request. - let extant = self.buffered.extant_block_ranges(); - - (_, remain) = remain.diff_inter(&extant); - (_, remain) = remain.diff_inter(&self.buffered.empty_view_ranges); - - // If it's in flight, don't request it again. 
- let mut in_flight = RangeMap::new(); - for peer in self.peers.values() { - for (_, start, end) in peer.pending_requests.values() { - in_flight.with_range(&Range { - start: *start, - end: end + 1, - }); - } - } - (_, remain) = remain.diff_inter(&in_flight); - - let now = SystemTime::now(); - let failed_request_sleep_duration = self.failed_request_sleep_duration; - - // If everything we have is in flight, we'll skip trying to request them (or update availability) - if remain.is_empty() { - trace!("block_store::request_blocks() : .. no non in_flight requests. Returning early"); - return Ok(true); - } - - for chance in 0..2 { - trace!( - "block_store::request_blocks() : chance = {chance} clock = {} peers = {}", - self.clock, - self.peers.len() - ); - // There may be no peers ... - self.clock = (self.clock + 1) % std::cmp::max(1, self.peers.len()); - // Slightly horrid - generate a list of peers which is the BTreeMap's list, shifted by clock. - let peers = self - .peers - .keys() - .skip(self.clock) - .chain(self.peers.keys().take(self.clock)) - .cloned() - .collect::>(); - - for peer in &peers { - debug!("block_store::request_blocks() : considering peer = {peer}"); - // If the last request failed < 10s or so ago, skip this peer, unless we're second-chance in - // which case, hey, why not? - let (requests, rem, query_availability) = { - let peer_info = self.peer_info(*peer); - if chance == 0 - && !peer_info - .last_request_failed_at - .and_then(|at| at.elapsed().ok()) - .map(|time_since| time_since > failed_request_sleep_duration) - .unwrap_or(true) - { - trace!("block_store::request_blocks() : .. Last request failed; skipping this peer"); - continue; - } - - if peer_info.pending_requests.len() - >= constants::MAX_PENDING_BLOCK_REQUESTS_PER_PEER - { - trace!( - "block_store::request_blocks() : .. Skipping peer {peer} - too many pending requests {0}", - peer_info.pending_requests.len() - ); - continue; - } - // Split .. - let left = constants::MAX_PENDING_BLOCK_REQUESTS_PER_PEER - - peer_info.pending_requests.len(); - let ranges = peer_info.get_ranges(to); - let (req, rem) = remain.diff_inter_limited(&ranges, Some(left)); - // If we are not about to make a request, and we do not have recent availability then - // make a synthetic request to get that availability. - let query_availability = req.is_empty() - && peer_info.pending_requests.is_empty() - && (!peer_info.have_availability() - || peer_info.availability_requested_at.map_or(true, |x| { - x.elapsed() - .map(|v| { - v > constants::REQUEST_PEER_VIEW_AVAILABILITY_NOT_BEFORE - }) - .unwrap_or(true) - })); - (req, rem, query_availability) - }; - - let mut request_sent = false; - // Send all requests now .. - for request in requests.ranges.iter() { - if !request.is_empty() { - trace!( - "block_store::request_blocks() : peer = {:?} request = {:?}: sending block request", - peer, - request, - ); - // Yay! - let message = ExternalMessage::BlockRequest(BlockRequest { - from_view: request.start, - to_view: request.end, - }); - let request_id = - self.message_sender.send_external_message(*peer, message)?; - self.peer_info(*peer) - .pending_requests - .insert(request_id, (now, request.start, request.end)); - request_sent = true; - } - } - // If we haven't got recent availability, and we haven't already asked for it, ask .. 
- if !request_sent && chance == 0 && query_availability { - trace!("block_store::request_blocks() : Querying availability"); - // Executive decision: Don't ask for any blocks here, because we are about to do so in duplicate - // later and we don't want to duplicate work - you could viably go for a slightly faster - // sync by just asking for all the blocks and letting the peer send what it has. - let message = ExternalMessage::BlockRequest(BlockRequest { - from_view: 0, - to_view: 0, - }); - let peer_info = self.peer_info(*peer); - peer_info.availability_requested_at = Some(now); - let _ = self.message_sender.send_external_message(*peer, message); - } - - // We only need to request stuff from peers if we haven't already done so. - remain = rem; - } - } - trace!("block_store::request_blocks() : all done"); - if !remain.is_empty() { - warn!( - "block_store::request_blocks() : Could not find peers for views {:?}", - remain - ); - if let Some(us) = &mut self.unserviceable_requests { - us.with_range_map(&remain); - } else { - self.unserviceable_requests = Some(remain); - } - } - Ok(true) + pub fn request_blocks(&mut self, _req: &RangeMap) -> Result { + Ok(false) // FIXME: Stub } pub fn get_block(&self, hash: Hash) -> Result> { diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index c0af4d5b3..cfc6140bc 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -3,7 +3,6 @@ use std::{ collections::{BTreeMap, BinaryHeap, VecDeque}, sync::Arc, time::{Duration, Instant}, - u64, }; use alloy::primitives::BlockNumber; @@ -924,7 +923,9 @@ impl Sync { )? .expect("missing highest block"); Ok(( - self.in_pipeline > 0 || !matches!(self.state, SyncState::Phase0), + !self.chain_metadata.is_empty() + || !self.chain_segments.is_empty() + || !self.recent_proposals.is_empty(), highest_block, )) } From d213e263e004a072c1e9595afd847c1537d1021b Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 10 Jan 2025 16:32:15 +0800 Subject: [PATCH 057/119] chore: comments, cleanup. --- zilliqa/src/sync.rs | 145 +++++++++++++++++++++++++------------------- 1 file changed, 81 insertions(+), 64 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index cfc6140bc..c300780ef 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -29,33 +29,29 @@ use crate::{ // // PHASE 1: Request missing chain metadata. // The entire chain metadata is stored in-memory, and is used to construct a chain of metadata. +// Each metadata basically contains the block_hash, block_number, parent_hash, and view_number. // 1. We start with the latest Proposal and request the chain of metadata from a peer. // 2. We construct the chain of metadata, based on the response received. -// 3. If the last block does not exist in our canonical history, we request for additional metadata. -// 4. If the last block exists, we have hit our canonical history. -// 5. Move to Phase 2. +// 3. If the last block does not exist in our history, we request for additional metadata. +// 4. If the last block exists, we have hit our history, we move to Phase 2. // // PHASE 2: Request missing blocks. -// Once the chain metadata is constructed, we request the missing blocks to replay the history. +// Once the chain metadata is constructed, we fill in the missing blocks to replay the history. +// We do not make any judgements (other than sanity) on the block and leave that up to consensus. // 1. We construct a set of hashes, from the in-memory chain metadata. -// 2. We send these block hashes to the same Peer (that sent the metadata) for retrieval. -// 3. 
We inject the Proposals into the pipeline, when the response is received. -// 4. If there are still missing blocks, we ask for more, from 1. -// 5. If there are no more missing blocks, we have filled up all blocks from the chain metadata. -// 6. Ready for Phase 3. +// 2. We request these blocks from the same Peer that sent the metadata. +// 3. We inject the received Proposals into the pipeline. +// 4. If there are still missing blocks, we ask for more. +// 5. If there are no more missing blocks, we move to Phase 3. // // PHASE 3: Zip it up. -// Phase 1&2 may run several times that brings up 99% of the chain. This closes the final gap. -// 1. We queue all newly received Proposals, while Phase 1 & 2 were in progress. -// 2. We check the head of the queue if its parent exists in our canonical history. -// 3. If it does not, we trigger Phase 1&2. +// Phase 1&2 may run several times and bring up 99% of the chain, but it will never catch up. +// This closes the final gap. +// 1. We queue all recently received Proposals, while Phase 1 & 2 were in progress. +// 2. We check the head of the queue, if its parent exists in our history. +// 3. If it does not, our history is too far away, we run Phase 1 again. // 4. If it does, we inject the entire queue into the pipeline. -// 5. We are caught up. - -#[cfg(debug_assertions)] -const DO_SPECULATIVE: bool = false; -#[cfg(not(debug_assertions))] -const DO_SPECULATIVE: bool = true; // Speeds up syncing by speculatively fetching blocks. +// 5. We are fully synced. #[derive(Debug)] pub struct Sync { @@ -92,6 +88,20 @@ pub struct Sync { } impl Sync { + // Speed up syncing by speculatively fetching blocks in Phase 1 & 2. + #[cfg(not(debug_assertions))] + const DO_SPECULATIVE: bool = true; + #[cfg(debug_assertions)] + const DO_SPECULATIVE: bool = false; + + // For V1 BlockRequest, we request a little more than we need, due to drift + // Since the view number is an 'internal' clock, it is possible for the same block number + // to have different view numbers. + // 10 ~ 1-min + // 20 ~ 1-hr + // 30 ~ 2-days + const VIEW_DRIFT: u64 = 10; + pub fn new( config: &NodeConfig, db: Arc, @@ -108,7 +118,9 @@ impl Sync { }) .collect(); let peer_id = message_sender.our_peer_id; - let max_batch_size = config.block_request_batch_size.clamp(30, 180); // 30-180 sec of blocks at a time. + let max_batch_size = config + .block_request_batch_size + .clamp(Self::VIEW_DRIFT as usize * 2, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. Ok(Self { @@ -130,12 +142,12 @@ impl Sync { }) } - /// Sync a block proposal. + /// Phase 0: Sync a block proposal. /// /// This is the main entry point for syncing a block proposal. /// We start by enqueuing all proposals, and then check if the parent block exists in history. - /// If the parent block exists, we do nothing. Ttherwise, we check the oldest one in the queue. - /// If we find its parent in history, we inject the entire queue. + /// If the parent block exists, we do nothing. Otherwise, we check the least recent one. + /// If we find its parent in history, we inject the entire queue. Otherwise, we start syncing. /// /// We do not perform checks on the Proposal here. This is done in the consensus layer. 
pub fn sync_proposal(&mut self, proposal: Proposal) -> Result<()> { @@ -232,10 +244,13 @@ impl Sync { } } - /// Retry phase 1 + /// Phase 2: Retry Phase 1 /// - /// If something went wrong, phase 1 may need to be retried for the most recent segment. - /// Pop the segment from the segment marker, and continue phase 1. + /// If something went wrong in Phase 2, Phase 1 may need to be retried for the recently used segment. + /// Things that could go wrong e.g. the peer went offline, the peer pruned history, etc. + /// + /// Pop the most recently used segment from the segment marker, and retry phase 1. + /// This will rebuild history from the previous marker, with another peer. fn retry_phase1(&mut self) -> Result<()> { if self.chain_segments.is_empty() { tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain_segments!"); @@ -250,22 +265,22 @@ impl Sync { key = p.parent_hash; } - // allow retry from p1 + // retry from Phase 1 tracing::info!( "sync::RetryPhase1 : retrying block {} from {}", meta.parent_hash, peer_info.peer_id, ); self.state = SyncState::Phase1(meta); - if DO_SPECULATIVE { + if Self::DO_SPECULATIVE { self.request_missing_metadata(None)?; } Ok(()) } - /// Handle a multi-block response. + /// Phase 2: Handle a multi-block response. /// - /// This is phase 2 in the syncing algorithm, where we receive a set of blocks and inject them into the pipeline. + /// This is Phase 2 in the syncing algorithm, where we receive a set of blocks and inject them into the pipeline. /// We also remove the blocks from the chain metadata, because they are now in the pipeline. pub fn handle_multiblock_response( &mut self, @@ -354,7 +369,7 @@ impl Sync { // Done with phase 2 if self.chain_segments.is_empty() { self.state = SyncState::Phase3; - } else if DO_SPECULATIVE { + } else if Self::DO_SPECULATIVE { // Speculatively request more blocks self.request_missing_blocks()?; } @@ -393,7 +408,7 @@ impl Sync { Ok(message) } - /// Request missing blocks from the chain. + /// Phase 2: Request missing blocks from the chain. /// /// It constructs a set of hashes, which constitute the series of blocks that are missing. /// These hashes are then sent to a Peer for retrieval. @@ -487,14 +502,8 @@ impl Sync { Ok(()) } - // we request a little more than we need, due to drift - // 10 ~ 1min - // 20 ~ 1hr - const VIEW_DRIFT: u64 = 10; - - /// Handle a V1 block response + /// Phase 1 / 2: Handle a V1 block response /// - /// This will be called during both Phase 1 & Phase 2 block responses. /// If the response if from a V2 peer, it will upgrade that peer to V2. /// In phase 1, it will extract the metadata and feed it into handle_metadata_response. /// In phase 2, it will extract the blocks and feed it into handle_multiblock_response. @@ -510,6 +519,7 @@ impl Sync { return Ok(()); } + // Downgrade empty responses if response.proposals.is_empty() { tracing::info!("sync::HandleBlockResponse : empty V1 from {from}"); self.done_with_peer(DownGrade::Empty); @@ -518,25 +528,28 @@ impl Sync { // Convert the V1 response into a V2 response. match self.state { - // Phase 1 - extract metadata from the set of proposals + // Phase 1 - construct the metadata chain from the set of received proposals SyncState::Phase1(ChainMetaData { block_number, mut parent_hash, .. }) => { // We do not buffer the proposals, as it takes 250MB/day! + // Instead, we will re-request the proposals again, in Phase 2. 
let metadata = response .proposals .into_iter() - .filter(|p| p.number() < block_number) // filter extras + // filter extras due to drift + .filter(|p| p.number() < block_number) .sorted_by(|a, b| b.number().cmp(&a.number())) + // filter any forks .filter(|p| { if parent_hash != p.hash() { return false; } parent_hash = p.header.qc.block_hash; true - }) // filter forks + }) .map(|p| ChainMetaData { block_hash: p.hash(), parent_hash: p.header.qc.block_hash, @@ -547,14 +560,17 @@ impl Sync { self.handle_metadata_response(from, metadata)?; } - // Phase 2 - extract the requested blocks only + + // Phase 2 - extract the requested proposals only. SyncState::Phase2(_) => { let multi_blocks = response .proposals .into_iter() - .filter(|p| self.chain_metadata.contains_key(&p.hash())) // filter extras + // filter any blocks that are not needed + .filter(|p| self.chain_metadata.contains_key(&p.hash())) .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); + self.handle_multiblock_response(from, multi_blocks)?; } _ => { @@ -567,17 +583,18 @@ impl Sync { Ok(()) } - /// Handle a response to a metadata request. + /// Phase 1: Handle a response to a metadata request. /// /// This is the first step in the syncing algorithm, where we receive a set of metadata and use it to - /// construct a chain history. We check that the metadata does indeed constitute a chain. If it does, - /// we record its segment marker and store the entire chain in-memory. + /// construct a chain history. We check that the metadata does indeed constitute a segment of a chain. + /// If it does, we record its segment marker and store the entire chain in-memory. pub fn handle_metadata_response( &mut self, from: PeerId, response: Vec, ) -> Result<()> { - if let Some(peer) = self.in_flight.as_ref() { + // Check for expected response + let segment_peer = if let Some(peer) = self.in_flight.as_ref() { if peer.peer_id != from { tracing::warn!( "sync::MetadataResponse : unexpected peer={} != {from}", @@ -585,12 +602,13 @@ impl Sync { ); return Ok(()); } + peer.clone() } else { + // We ignore any responses that arrived late, since the original request has already 'timed-out'. tracing::warn!("sync::MetadataResponse : spurious response {from}"); return Ok(()); - } + }; - let segment_peer = self.in_flight.as_ref().unwrap().clone(); // Process whatever we have received. if response.is_empty() { // Empty response, downgrade peer and retry with a new peer. @@ -624,7 +642,7 @@ impl Sync { // TODO: possibly, discard and rebuild entire chain // if something does not match, do nothing and retry the request with the next peer. tracing::error!( - "sync::MetadataResponse : retry metadata expected hash={block_hash} != {} num={block_num} != {}", + "sync::MetadataResponse : unexpected metadata hash={block_hash} != {}, num={block_num} != {}", meta.block_hash, meta.block_number, ); @@ -652,18 +670,17 @@ impl Sync { from ); - // Record the actual chain metadata + // Record the constructed chain metadata, check for loops for meta in segment { if self.chain_metadata.insert(meta.block_hash, meta).is_some() { - anyhow::bail!("loop in chain!"); // there is a possible loop in the chain + anyhow::bail!("sync::MetadataResponse : loop in chain!"); // there is a possible loop in the chain } } - // If the segment does not link to our canonical history, fire the next request + // If the segment hits our history, start Phase 2. if self.db.get_block_by_hash(&last_block_hash)?.is_some() { - // Hit our internal history. Next, phase 2. 
self.state = SyncState::Phase2(Hash::ZERO); - } else if DO_SPECULATIVE { + } else if Self::DO_SPECULATIVE { self.request_missing_metadata(None)?; } @@ -714,10 +731,10 @@ impl Sync { Ok(message) } - /// Request missing chain from a peer. + /// Phase 1: Request chain metadata from a peer. /// /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. - /// If phase 1 is in progress, it continues requesting blocks from the last known phase 1 block. + /// If Phase 1 is in progress, it continues requesting blocks from the last known Phase 1 block. /// Otherwise, it requests blocks from the given starting metadata. pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { if matches!(self.state, SyncState::Phase2(_)) || matches!(self.state, SyncState::Phase3) { @@ -800,7 +817,7 @@ impl Sync { Ok(()) } - /// Inject the proposals into the chain. + /// Phase 2 / 3: Inject the proposals into the chain. /// /// It adds the list of proposals into the pipeline for execution. /// It also outputs some syncing statistics. @@ -852,7 +869,7 @@ impl Sync { /// Mark a received proposal /// - /// Mark a proposal as received, and remove it from the cache. + /// Mark a proposal as received, and remove it from the chain. pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { if prop.from != self.peer_id { tracing::error!( @@ -873,7 +890,6 @@ impl Sync { /// Downgrade a peer based on the response received. fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { - // Downgrade peer, if necessary peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better, to avoid a single source of truth. peer.score = peer.score.max(self.peers.peek().unwrap().score); @@ -886,10 +902,10 @@ impl Sync { /// Add a peer to the list of peers. pub fn add_peer(&mut self, peer: PeerId) { - // new peers should be tried last, which gives them time to sync first. + // new peers should be tried later, which gives them time to sync first. let new_peer = PeerInfo { version: PeerVer::V1, // default V2 - score: self.peers.iter().map(|p| p.score).max().unwrap_or_default(), + score: self.peers.iter().map(|p| p.score).min().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), }; @@ -901,6 +917,7 @@ impl Sync { self.peers.retain(|p| p.peer_id != peer); } + /// Get the next best peer to use fn get_next_peer(&mut self) -> Option { // Minimum of 2 peers to avoid single source of truth. if self.peers.len() < 2 { @@ -908,7 +925,7 @@ impl Sync { } let mut peer = self.peers.pop()?; - peer.last_used = std::time::Instant::now(); // used to determine stale in-flight requests. + peer.last_used = std::time::Instant::now(); // used to determine stale requests. Some(peer) } From 3f44c0a3b951de448043c0d3b299478a104f5247 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 10 Jan 2025 17:15:40 +0800 Subject: [PATCH 058/119] fix: correct Phase 2 range, the stored value is accurate. --- zilliqa/src/sync.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index c300780ef..86f66c8c5 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -306,10 +306,6 @@ impl Sync { tracing::warn!("sync::MultiBlockResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return self.retry_phase1(); - } else if response.len() < self.max_batch_size { - // Partial response, process blocks. 
- tracing::warn!("sync::MultiBlockResponse : partial blocks {from}",); - self.done_with_peer(DownGrade::None); } else { self.done_with_peer(DownGrade::None); } @@ -443,14 +439,12 @@ impl Sync { // If we have no chain_segments, we have nothing to do if let Some((peer_info, meta)) = self.chain_segments.last() { - let to_view = meta.view_number.saturating_add(Self::VIEW_DRIFT); - let mut from_view = meta.view_number; + // let mut from_view = meta.view_number; let mut request_hashes = Vec::with_capacity(self.max_batch_size); let mut key = meta.parent_hash; // start from this block while let Some(meta) = self.chain_metadata.remove(&key) { request_hashes.push(meta.block_hash); key = meta.parent_hash; - from_view = meta.view_number; self.chain_metadata.insert(meta.block_hash, meta); // reinsert, for retries } @@ -488,7 +482,11 @@ impl Sync { last_used: std::time::Instant::now(), score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers }); - ExternalMessage::BlockRequest(BlockRequest { to_view, from_view }) + // do not add VIEW_DRIFT - the stored marker is accurate! + ExternalMessage::BlockRequest(BlockRequest { + to_view: meta.view_number.saturating_sub(1), + from_view: meta.view_number.saturating_sub(self.max_batch_size as u64), + }) } }; self.message_sender @@ -615,10 +613,6 @@ impl Sync { tracing::warn!("sync::MetadataResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); return Ok(()); - } else if response.len() < self.max_batch_size { - // Partial response, process the response. - tracing::warn!("sync::MetadataResponse : partial blocks {from}",); - self.done_with_peer(DownGrade::None); } else { self.done_with_peer(DownGrade::None); } From 866794c23bfea5796c2503915891396bbed64bf6 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 11 Jan 2025 13:11:32 +0800 Subject: [PATCH 059/119] feat: ensure unique peers. --- zilliqa/src/sync.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 86f66c8c5..f876234e4 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -882,10 +882,13 @@ impl Sync { } /// Downgrade a peer based on the response received. + /// + /// This algorithm favours good peers that respond quickly (i.e. no timeout). + /// In most cases, it eventually degenerates into 2 sources - avoid a single source of truth. fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { peer.score = peer.score.saturating_add(downgrade as u32); - // Ensure that the next peer is equal or better, to avoid a single source of truth. + // Ensure that the next peer is equal or better peer.score = peer.score.max(self.peers.peek().unwrap().score); // Reinsert peers that are good if peer.score < u32::MAX { @@ -896,7 +899,10 @@ impl Sync { /// Add a peer to the list of peers. pub fn add_peer(&mut self, peer: PeerId) { - // new peers should be tried later, which gives them time to sync first. + // ensure that it is unique - avoids single source of truth + self.remove_peer(peer); + // if the new peer is not synced, it will get downgraded to the back of heap. + // but by placing them at the back of the 'best' pack, we get to try them out soon. let new_peer = PeerInfo { version: PeerVer::V1, // default V2 score: self.peers.iter().map(|p| p.score).min().unwrap_or_default(), @@ -908,7 +914,7 @@ impl Sync { /// Remove a peer from the list of peers. 
pub fn remove_peer(&mut self, peer: PeerId) { - self.peers.retain(|p| p.peer_id != peer); + self.peers.retain(|p: &PeerInfo| p.peer_id != peer); } /// Get the next best peer to use From b05f0987dacf5eeec802fa338a63a6b7de04b79c Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 11 Jan 2025 13:16:18 +0800 Subject: [PATCH 060/119] feat: output rate stats in block/s --- zilliqa/src/sync.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index f876234e4..a860e765d 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -822,11 +822,9 @@ impl Sync { // Output some stats if let Some((when, injected)) = self.inject_at { - tracing::debug!( - "sync::InjectProposals : synced {}/{:?}", - injected - self.in_pipeline, - when.elapsed() - ); + let diff = injected - self.in_pipeline; + let rate = diff as f32 / when.elapsed().as_secs_f32(); + tracing::debug!("sync::InjectProposals : synced {} block/s", rate); } // Increment proposals injected From 84900bf1eab451147a2b239de92c8fa2796a5be0 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sat, 11 Jan 2025 23:12:36 +0800 Subject: [PATCH 061/119] feat: minor reorg, logging. --- zilliqa/src/sync.rs | 122 +++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index a860e765d..d853964c1 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -102,6 +102,9 @@ impl Sync { // 30 ~ 2-days const VIEW_DRIFT: u64 = 10; + // Minimum of 2 peers to avoid single source of truth. + const MIN_PEERS: usize = 2; + pub fn new( config: &NodeConfig, db: Arc, @@ -199,8 +202,10 @@ impl Sync { let ancestor_hash = self.recent_proposals.front().unwrap().header.qc.block_hash; if self.db.get_block_by_hash(&ancestor_hash)?.is_some() { tracing::info!( - "sync::SyncProposal : finishing up {} blocks for segment #0 from {ancestor_hash}", - self.recent_proposals.len() + "sync::SyncProposal : finishing {} blocks for segment #{} from {}", + self.recent_proposals.len(), + self.chain_segments.len(), + self.peer_id, ); // inject the proposals let proposals = self.recent_proposals.drain(..).collect_vec(); @@ -253,24 +258,24 @@ impl Sync { /// This will rebuild history from the previous marker, with another peer. 
fn retry_phase1(&mut self) -> Result<()> { if self.chain_segments.is_empty() { - tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain_segments!"); + tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain segments!"); self.state = SyncState::Phase0; return Ok(()); } + tracing::debug!( + "sync::RetryPhase1 : retrying segment #{}", + self.chain_segments.len(), + ); + // remove the last segment from the chain metadata - let (peer_info, meta) = self.chain_segments.pop().unwrap(); + let (_, meta) = self.chain_segments.pop().unwrap(); let mut key = meta.parent_hash; while let Some(p) = self.chain_metadata.remove(&key) { key = p.parent_hash; } // retry from Phase 1 - tracing::info!( - "sync::RetryPhase1 : retrying block {} from {}", - meta.parent_hash, - peer_info.peer_id, - ); self.state = SyncState::Phase1(meta); if Self::DO_SPECULATIVE { self.request_missing_metadata(None)?; @@ -310,10 +315,6 @@ impl Sync { self.done_with_peer(DownGrade::None); } - let SyncState::Phase2(p2_hash) = self.state else { - anyhow::bail!("sync::MultiBlockResponse : invalid state"); - }; - tracing::info!( "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", response.len(), @@ -321,18 +322,11 @@ impl Sync { from ); - // Spurious response - let Some((peer_info, _)) = self.chain_segments.last() else { - anyhow::bail!("sync::MultiBlockResponse: no more chain_segments!"); + // If the checksum does not match, retry phase 1. Maybe the node has pruned the segment. + let SyncState::Phase2(check_sum) = self.state else { + anyhow::bail!("sync::MultiBlockResponse : invalid state"); }; - // If the response is not from the expected peer e.g. delayed response, retry phase 2. - if peer_info.peer_id != from { - tracing::warn!("sync::MultiBlockResponse: unknown peer {from}, will retry"); - return Ok(()); - } - - // If the checksum does not match, retry phase 1. Maybe the node has pruned the segment. 
let checksum = response .iter() .fold(Hash::builder().with(Hash::ZERO.as_bytes()), |sum, p| { @@ -340,8 +334,10 @@ impl Sync { }) .finalize(); - if p2_hash != checksum { - tracing::error!("sync::MultiBlockResponse : mismatch history {checksum}"); + if check_sum != checksum { + tracing::error!( + "sync::MultiBlockResponse : unexpected checksum={check_sum} != {checksum}" + ); return self.retry_phase1(); } @@ -464,7 +460,6 @@ impl Sync { self.chain_segments.len(), peer_info.peer_id, ); - let message = match peer_info.version { PeerVer::V2 => { self.in_flight = Some(PeerInfo { @@ -493,9 +488,7 @@ impl Sync { .send_external_message(peer_info.peer_id, message)?; } } else { - tracing::warn!( - "sync::RequestMissingBlocks : insufficient peers to request missing blocks" - ); + tracing::warn!("sync::RequestMissingBlocks : insufficient peers to handle request"); } Ok(()) } @@ -511,7 +504,7 @@ impl Sync { && response.proposals.is_empty() && response.from_view == u64::MAX { - tracing::info!("sync::HandleBlockResponse : upgrading {from} to V2",); + tracing::info!("sync::HandleBlockResponse : upgrading {from}",); self.in_flight.as_mut().unwrap().version = PeerVer::V2; self.done_with_peer(DownGrade::None); return Ok(()); @@ -519,11 +512,16 @@ impl Sync { // Downgrade empty responses if response.proposals.is_empty() { - tracing::info!("sync::HandleBlockResponse : empty V1 from {from}"); + tracing::info!("sync::HandleBlockResponse : empty response {from}"); self.done_with_peer(DownGrade::Empty); return Ok(()); } + tracing::trace!( + "sync::HandleBlockResponse : received {} blocks from {from}", + response.proposals.len() + ); + // Convert the V1 response into a V2 response. match self.state { // Phase 1 - construct the metadata chain from the set of received proposals @@ -564,7 +562,7 @@ impl Sync { let multi_blocks = response .proposals .into_iter() - // filter any blocks that are not needed + // filter any blocks that are not in the chain e.g. forks .filter(|p| self.chain_metadata.contains_key(&p.hash())) .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); @@ -731,7 +729,7 @@ impl Sync { /// If Phase 1 is in progress, it continues requesting blocks from the last known Phase 1 block. /// Otherwise, it requests blocks from the given starting metadata. pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { - if matches!(self.state, SyncState::Phase2(_)) || matches!(self.state, SyncState::Phase3) { + if !matches!(self.state, SyncState::Phase1(_)) && !matches!(self.state, SyncState::Phase0) { anyhow::bail!("sync::RequestMissingMetadata : invalid state"); } // Early exit if there's a request in-flight; and if it has not expired. @@ -755,7 +753,12 @@ impl Sync { } if let Some(peer) = self.get_next_peer() { - let peer_id = peer.peer_id; + tracing::info!( + "sync::RequestMissingMetadata : requesting {} metadata of segment #{} from {}", + self.max_batch_size, + self.chain_segments.len() + 1, + peer.peer_id + ); let message = match self.state { SyncState::Phase1(ChainMetaData { parent_hash, .. 
}) if matches!(peer.version, PeerVer::V2) => @@ -795,18 +798,11 @@ impl Sync { } _ => anyhow::bail!("sync::MissingMetadata : invalid state"), }; - tracing::info!( - ?message, - "sync::RequestMissingMetadata : requesting missing chain from {}", - peer_id - ); - self.in_flight = Some(peer); self.message_sender - .send_external_message(peer_id, message)?; + .send_external_message(peer.peer_id, message)?; + self.in_flight = Some(peer); } else { - tracing::warn!( - "sync::RequestMissingMetadata : insufficient peers to request missing blocks" - ); + tracing::warn!("sync::RequestMissingMetadata : insufficient peers to handle request"); } Ok(()) } @@ -829,7 +825,11 @@ impl Sync { // Increment proposals injected self.in_pipeline = self.in_pipeline.saturating_add(proposals.len()); - let len = proposals.len(); + tracing::debug!( + "sync::InjectProposals : injecting {}/{} proposals", + proposals.len(), + self.in_pipeline + ); // Just pump the Proposals back to ourselves. for p in proposals { @@ -849,12 +849,6 @@ impl Sync { } self.inject_at = Some((std::time::Instant::now(), self.in_pipeline)); - - tracing::debug!( - "sync::InjectProposals : injected {}/{} proposals", - len, - self.in_pipeline - ); // return last proposal Ok(()) } @@ -885,6 +879,7 @@ impl Sync { /// In most cases, it eventually degenerates into 2 sources - avoid a single source of truth. fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { + tracing::trace!("sync::DoneWithPeer {} {:?}", peer.peer_id, downgrade); peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better peer.score = peer.score.max(self.peers.peek().unwrap().score); @@ -897,8 +892,6 @@ impl Sync { /// Add a peer to the list of peers. pub fn add_peer(&mut self, peer: PeerId) { - // ensure that it is unique - avoids single source of truth - self.remove_peer(peer); // if the new peer is not synced, it will get downgraded to the back of heap. // but by placing them at the back of the 'best' pack, we get to try them out soon. let new_peer = PeerInfo { @@ -907,24 +900,27 @@ impl Sync { peer_id: peer, last_used: Instant::now(), }; + tracing::trace!("sync::AddPeer {peer}"); + // ensure that it is unique - avoids single source of truth + self.peers.retain(|p: &PeerInfo| p.peer_id != peer); self.peers.push(new_peer); } /// Remove a peer from the list of peers. pub fn remove_peer(&mut self, peer: PeerId) { + tracing::trace!("sync::RemovePeer {peer}"); self.peers.retain(|p: &PeerInfo| p.peer_id != peer); } /// Get the next best peer to use fn get_next_peer(&mut self) -> Option { - // Minimum of 2 peers to avoid single source of truth. - if self.peers.len() < 2 { - return None; + if self.peers.len() >= Self::MIN_PEERS { + let mut peer = self.peers.pop()?; + peer.last_used = std::time::Instant::now(); // used to determine stale requests. + tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); + return Some(peer); } - - let mut peer = self.peers.pop()?; - peer.last_used = std::time::Instant::now(); // used to determine stale requests. - Some(peer) + None } /// Returns (am_syncing, current_highest_block) @@ -938,9 +934,9 @@ impl Sync { )? 
.expect("missing highest block"); Ok(( - !self.chain_metadata.is_empty() - || !self.chain_segments.is_empty() - || !self.recent_proposals.is_empty(), + !self.recent_proposals.is_empty() + || !self.chain_metadata.is_empty() + || !self.chain_segments.is_empty(), highest_block, )) } From 805313eb272286d89d18ed1cd7b0ba5bbf280461 Mon Sep 17 00:00:00 2001 From: Shawn Date: Sun, 12 Jan 2025 21:23:15 +0800 Subject: [PATCH 062/119] feat: added saving of metadata/segments to DB - allows continuation. --- zilliqa/src/sync.rs | 143 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 5 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index d853964c1..242209c71 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -9,6 +9,7 @@ use alloy::primitives::BlockNumber; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; +use rusqlite::named_params; use crate::{ cfg::NodeConfig, @@ -126,6 +127,75 @@ impl Sync { .clamp(Self::VIEW_DRIFT as usize * 2, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. + // FIXME: Move these to db.rs later + db.with_sqlite_tx(|c| { + c.execute_batch( + "CREATE TABLE IF NOT EXISTS sync_data ( + block_hash BLOB NOT NULL UNIQUE, + parent_hash BLOB NOT NULL, + block_number INTEGER NOT NULL PRIMARY KEY, + view_number INTEGER NOT NULL, + peer BLOB DEFAULT NULL + );", + )?; + Ok(()) + })?; + + // Restore metadata/segments + let mut metadata: BTreeMap = BTreeMap::new(); + let mut segments: Vec<(PeerInfo, ChainMetaData)> = Vec::new(); + + db.with_sqlite_tx(|c| { + let _ = c.prepare( + "SELECT parent_hash, block_hash, block_number, view_number, peer FROM sync_data ORDER BY rowid DESC", + )? + .query_map([], |row| { + let m = ChainMetaData{ + parent_hash: row.get_unwrap(0), + block_hash: row.get_unwrap(1), + block_number: row.get_unwrap(2), + view_number: row.get_unwrap(3), + }; + metadata.insert(m.block_hash, m.clone()); + + if let Ok(p) = row.get::<_, Vec>(4) { + if let Ok(peer_id) = PeerId::from_bytes(&p) { + segments.push(( + PeerInfo { + version: PeerVer::V1, + score: 0, + peer_id, + last_used: Instant::now(), + }, + m.clone(), + )); + } + } + + Ok(m) + })?.collect_vec(); + Ok(()) + })?; + + // remove last segment + if let Some((_, meta)) = segments.pop() { + let mut key = meta.parent_hash; + while let Some(p) = metadata.remove(&key) { + key = p.parent_hash; + } + } + + let state = if segments.is_empty() { + SyncState::Phase0 + } else { + tracing::info!( + "sync::New : continue from segment #{} with {} metadata", + segments.len(), + metadata.len() + ); + SyncState::Phase1(segments.last().as_ref().unwrap().1.clone()) + }; + Ok(Self { db, message_sender, @@ -136,15 +206,64 @@ impl Sync { max_blocks_in_flight, in_flight: None, in_pipeline: usize::MIN, - chain_metadata: BTreeMap::new(), - chain_segments: Vec::new(), - state: SyncState::Phase0, + chain_metadata: metadata, + chain_segments: segments, + state, recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, started_at_block_number: u64::MIN, }) } + fn pop_segment(&self, meta: ChainMetaData) -> Result<()> { + self.db.with_sqlite_tx(|c| { + c.execute( + "UPDATE sync_data SET peer = NULL WHERE block_hash = :block_hash", + named_params! 
{ + ":block_hash": meta.block_hash, + }, + )?; + Ok(()) + }) + } + + fn push_segment(&self, peer: PeerInfo, meta: ChainMetaData) -> Result<()> { + self.db.with_sqlite_tx(|c| { + c.execute( + "UPDATE sync_data SET peer = :peer WHERE block_hash = :block_hash", + named_params! { + ":peer": peer.peer_id.to_bytes(), + ":block_hash": meta.block_hash, + }, + )?; + Ok(()) + }) + } + + // TODO: Move into db.rs, optimise + fn insert_metadata(&mut self, meta: ChainMetaData) -> Result<()> { + self.db.with_sqlite_tx(|c| { + c.execute( + "INSERT INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)", + named_params! { + ":parent_hash": meta.parent_hash, + ":block_hash": meta.block_hash, + ":block_number": meta.block_number, + ":view_number": meta.view_number, + }, + )?; + Ok(()) + }) + } + + // TODO: Move into db.rs, optimise + fn remove_metadata(&self, hash: Hash) -> Result<()> { + self.db.with_sqlite_tx(|c| { + c.execute("DELETE FROM sync_data WHERE block_hash = ?1", [hash])?; + Ok(()) + }) + } + /// Phase 0: Sync a block proposal. /// /// This is the main entry point for syncing a block proposal. @@ -270,9 +389,12 @@ impl Sync { // remove the last segment from the chain metadata let (_, meta) = self.chain_segments.pop().unwrap(); + self.pop_segment(meta.clone())?; + let mut key = meta.parent_hash; while let Some(p) = self.chain_metadata.remove(&key) { key = p.parent_hash; + self.remove_metadata(p.block_hash)?; } // retry from Phase 1 @@ -351,11 +473,15 @@ impl Sync { for p in &proposals { if self.chain_metadata.remove(&p.hash()).is_none() { anyhow::bail!("missing chain data for proposal"); // this should never happen! + } else { + self.remove_metadata(p.hash())?; } } // Done with this segment - self.chain_segments.pop(); + if let Some((_, meta)) = self.chain_segments.pop() { + self.pop_segment(meta)?; + } self.inject_proposals(proposals)?; // Done with phase 2 @@ -649,6 +775,7 @@ impl Sync { let segment = response; // Record landmark, including peer that has this set of blocks + self.push_segment(segment_peer.clone(), meta.clone())?; self.chain_segments.push((segment_peer, meta.clone())); // Record the oldest block in the chain's parent @@ -664,8 +791,14 @@ impl Sync { // Record the constructed chain metadata, check for loops for meta in segment { - if self.chain_metadata.insert(meta.block_hash, meta).is_some() { + if self + .chain_metadata + .insert(meta.block_hash, meta.clone()) + .is_some() + { anyhow::bail!("sync::MetadataResponse : loop in chain!"); // there is a possible loop in the chain + } else { + self.insert_metadata(meta)?; } } From f23e71b19df8494a7009cc60176cd38e1bf8010a Mon Sep 17 00:00:00 2001 From: Shawn Date: Sun, 12 Jan 2025 21:49:30 +0800 Subject: [PATCH 063/119] feat: added stateful sync algorithm feature - can continue after restart. 
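To make the restart behaviour concrete: the bookkeeping introduced in the previous commit lives in the sync_data table, and on startup the syncer only needs to see whether any rows survived to decide between resuming and falling back to Phase 0. The following is a minimal, self-contained sketch of that round-trip, not the node's actual wiring: it assumes an in-memory rusqlite connection in place of Db::with_sqlite_tx, 32-byte arrays in place of the crate's Hash type, and made-up block/view numbers.

    // Sketch only: standalone round-trip of the sync_data table.
    use rusqlite::{named_params, Connection};

    fn main() -> rusqlite::Result<()> {
        let c = Connection::open_in_memory()?;
        c.execute_batch(
            "CREATE TABLE IF NOT EXISTS sync_data (
                block_hash   BLOB NOT NULL UNIQUE,
                parent_hash  BLOB NOT NULL,
                block_number INTEGER NOT NULL PRIMARY KEY,
                view_number  INTEGER NOT NULL,
                peer         BLOB DEFAULT NULL
            );",
        )?;

        // Persist one piece of chain metadata (hypothetical values).
        let (parent, block) = ([0u8; 32], [1u8; 32]);
        c.execute(
            "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number)
             VALUES (:parent_hash, :block_hash, :block_number, :view_number)",
            named_params! {
                ":parent_hash": parent.as_slice(),
                ":block_hash": block.as_slice(),
                ":block_number": 42i64,
                ":view_number": 43i64,
            },
        )?;

        // After a restart, any surviving rows mean "resume syncing" rather
        // than starting over from Phase 0.
        let rows: i64 = c
            .prepare("SELECT COUNT(*) FROM sync_data")?
            .query_row([], |row| row.get(0))?;
        assert_eq!(rows, 1);
        Ok(())
    }
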
--- zilliqa/src/sync.rs | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 242209c71..c43e4e33b 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -142,8 +142,8 @@ impl Sync { })?; // Restore metadata/segments - let mut metadata: BTreeMap = BTreeMap::new(); - let mut segments: Vec<(PeerInfo, ChainMetaData)> = Vec::new(); + let mut metadata = BTreeMap::new(); + let mut segments = Vec::new(); db.with_sqlite_tx(|c| { let _ = c.prepare( @@ -182,13 +182,20 @@ impl Sync { let mut key = meta.parent_hash; while let Some(p) = metadata.remove(&key) { key = p.parent_hash; + db.with_sqlite_tx(|c| { + c.execute( + "DELETE FROM sync_data WHERE block_hash = ?1", + [p.block_hash], + )?; + Ok(()) + })?; } } let state = if segments.is_empty() { SyncState::Phase0 } else { - tracing::info!( + tracing::debug!( "sync::New : continue from segment #{} with {} metadata", segments.len(), metadata.len() @@ -196,6 +203,12 @@ impl Sync { SyncState::Phase1(segments.last().as_ref().unwrap().1.clone()) }; + let start_at = if let SyncState::Phase1(m) = &state { + m.block_number + } else { + u64::MIN + }; + Ok(Self { db, message_sender, @@ -211,7 +224,7 @@ impl Sync { state, recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, - started_at_block_number: u64::MIN, + started_at_block_number: start_at, }) } @@ -244,7 +257,7 @@ impl Sync { fn insert_metadata(&mut self, meta: ChainMetaData) -> Result<()> { self.db.with_sqlite_tx(|c| { c.execute( - "INSERT INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)", + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)", named_params! { ":parent_hash": meta.parent_hash, ":block_hash": meta.block_hash, @@ -561,13 +574,11 @@ impl Sync { // If we have no chain_segments, we have nothing to do if let Some((peer_info, meta)) = self.chain_segments.last() { - // let mut from_view = meta.view_number; let mut request_hashes = Vec::with_capacity(self.max_batch_size); let mut key = meta.parent_hash; // start from this block - while let Some(meta) = self.chain_metadata.remove(&key) { + while let Some(meta) = self.chain_metadata.get(&key) { request_hashes.push(meta.block_hash); key = meta.parent_hash; - self.chain_metadata.insert(meta.block_hash, meta); // reinsert, for retries } // Checksum of the request hashes From f2f57992a3e3ed38ae501a46dcc8839e7740acb0 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 13 Jan 2025 16:56:20 +0800 Subject: [PATCH 064/119] feat: rebuilt the algorithm to use DB for state, instead of in-memory. 
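This rework keeps the whole sync state in SQLite: rows whose peer column is non-NULL act as segment markers (hence the partial index), and segments are walked by following parent_hash links with small per-row queries. As a possible later optimisation, and purely for reference, that walk could be collapsed into a single recursive CTE. The helper below is a hypothetical sketch of that idea against the same sync_data schema; it is not part of this commit, it represents hashes as plain byte vectors, and it assumes the metadata is acyclic (the loop check elsewhere in sync.rs guards that).

    // Hypothetical alternative to the row-by-row walk in get_segment():
    // one recursive CTE that follows parent_hash links from a starting hash.
    fn get_segment_cte(c: &rusqlite::Connection, start: &[u8]) -> rusqlite::Result<Vec<Vec<u8>>> {
        let mut stmt = c.prepare_cached(
            "WITH RECURSIVE seg(block_hash, parent_hash) AS (
                 SELECT block_hash, parent_hash FROM sync_data WHERE block_hash = :start
                 UNION ALL
                 SELECT s.block_hash, s.parent_hash
                 FROM sync_data s JOIN seg ON s.block_hash = seg.parent_hash
             )
             SELECT block_hash FROM seg",
        )?;
        let rows = stmt.query_map(rusqlite::named_params! { ":start": start }, |row| row.get(0))?;
        rows.collect()
    }
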
--- zilliqa/src/sync.rs | 339 ++++++++++++++++++++++++-------------------- 1 file changed, 185 insertions(+), 154 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index c43e4e33b..149f44e57 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -1,6 +1,6 @@ use std::{ cmp::Ordering, - collections::{BTreeMap, BinaryHeap, VecDeque}, + collections::{BinaryHeap, VecDeque}, sync::Arc, time::{Duration, Instant}, }; @@ -9,7 +9,7 @@ use alloy::primitives::BlockNumber; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; -use rusqlite::named_params; +use rusqlite::{named_params, OptionalExtension}; use crate::{ cfg::NodeConfig, @@ -76,10 +76,6 @@ pub struct Sync { peer_id: PeerId, // internal sync state state: SyncState, - // complete chain metadata, in-memory - chain_metadata: BTreeMap, - // markers to segments in the chain, and the source peer for that segment. - chain_segments: Vec<(PeerInfo, ChainMetaData)>, // fixed-size queue of the most recent proposals recent_proposals: VecDeque, // for statistics only @@ -127,7 +123,8 @@ impl Sync { .clamp(Self::VIEW_DRIFT as usize * 2, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. - // FIXME: Move these to db.rs later + // This DB could be left in-here as it is only used in this module + // TODO: Make this in-memory by exploiting SQLite TEMP tables i.e. CREATE TEMP TABLE db.with_sqlite_tx(|c| { c.execute_batch( "CREATE TABLE IF NOT EXISTS sync_data ( @@ -136,77 +133,26 @@ impl Sync { block_number INTEGER NOT NULL PRIMARY KEY, view_number INTEGER NOT NULL, peer BLOB DEFAULT NULL - );", + ); + CREATE INDEX IF NOT EXISTS idx_sync_data ON sync_data(block_number) WHERE peer IS NOT NULL;", )?; Ok(()) })?; // Restore metadata/segments - let mut metadata = BTreeMap::new(); - let mut segments = Vec::new(); - + let mut segments = 0; db.with_sqlite_tx(|c| { - let _ = c.prepare( - "SELECT parent_hash, block_hash, block_number, view_number, peer FROM sync_data ORDER BY rowid DESC", - )? - .query_map([], |row| { - let m = ChainMetaData{ - parent_hash: row.get_unwrap(0), - block_hash: row.get_unwrap(1), - block_number: row.get_unwrap(2), - view_number: row.get_unwrap(3), - }; - metadata.insert(m.block_hash, m.clone()); - - if let Ok(p) = row.get::<_, Vec>(4) { - if let Ok(peer_id) = PeerId::from_bytes(&p) { - segments.push(( - PeerInfo { - version: PeerVer::V1, - score: 0, - peer_id, - last_used: Instant::now(), - }, - m.clone(), - )); - } - } - - Ok(m) - })?.collect_vec(); + segments = c + .prepare_cached("SELECT COUNT(block_number) FROM sync_data WHERE peer IS NOT NULL")? + .query_row([], |row| row.get::<_, usize>(0)) + .optional()? 
+ .unwrap_or_default(); Ok(()) })?; - - // remove last segment - if let Some((_, meta)) = segments.pop() { - let mut key = meta.parent_hash; - while let Some(p) = metadata.remove(&key) { - key = p.parent_hash; - db.with_sqlite_tx(|c| { - c.execute( - "DELETE FROM sync_data WHERE block_hash = ?1", - [p.block_hash], - )?; - Ok(()) - })?; - } - } - - let state = if segments.is_empty() { + let state = if segments == 0 { SyncState::Phase0 } else { - tracing::debug!( - "sync::New : continue from segment #{} with {} metadata", - segments.len(), - metadata.len() - ); - SyncState::Phase1(segments.last().as_ref().unwrap().1.clone()) - }; - - let start_at = if let SyncState::Phase1(m) = &state { - m.block_number - } else { - u64::MIN + SyncState::Retry1 }; Ok(Self { @@ -219,45 +165,147 @@ impl Sync { max_blocks_in_flight, in_flight: None, in_pipeline: usize::MIN, - chain_metadata: metadata, - chain_segments: segments, state, recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, - started_at_block_number: start_at, + started_at_block_number: 0, }) } - fn pop_segment(&self, meta: ChainMetaData) -> Result<()> { + /// Returns the number of stored segments + fn count_segments(&self) -> Result { + let mut segments = 0; self.db.with_sqlite_tx(|c| { - c.execute( - "UPDATE sync_data SET peer = NULL WHERE block_hash = :block_hash", - named_params! { - ":block_hash": meta.block_hash, + segments = c + .prepare_cached("SELECT COUNT(block_number) FROM sync_data WHERE peer IS NOT NULL")? + .query_row([], |row| row.get(0)) + .optional()? + .unwrap_or_default(); + Ok(()) + })?; + Ok(segments) + } + + /// Checks if the stored metadata exists + fn contains_metadata(&self, hash: &Hash) -> Result { + let mut result = false; + self.db.with_sqlite_tx(|c| { + result = c + .prepare_cached("SELECT block_number FROM sync_data WHERE block_hash = ?1")? + .query_row([hash], |row| row.get::<_, u64>(0)) + .optional()? + .is_some(); + Ok(()) + })?; + Ok(result) + } + + /// Retrieves bulk metadata information from the given block_hash (inclusive) + fn get_segment(&self, hash: Hash) -> Result> { + let mut hashes = Vec::with_capacity(self.max_batch_size); + let mut block_hash = hash; + self.db.with_sqlite_tx(|c| { + while let Some(parent_hash) = c + .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get::<_, Hash>(0)) + .optional()? + { + hashes.push(block_hash); + block_hash = parent_hash; + } + Ok(()) + })?; + Ok(hashes) + } + + /// Peeks into the top of the segment stack. + fn last_segment(&self) -> Result> { + let mut result = None; + self.db.with_sqlite_tx(|c| { + result = c + .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, peer FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + .query_row([], |row| Ok(( + ChainMetaData{ + parent_hash: row.get(0)?, + block_hash: row.get(1)?, + block_number: row.get(2)?, + view_number: row.get(3)?, }, - )?; + PeerInfo { + last_used: Instant::now(), + score:u32::MAX, + version: PeerVer::V1, + peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), + }, + ))) + .optional()?; + Ok(()) + })?; + Ok(result) + } + + /// Pops a segment from the stack; and bulk removes all metadata associated with it. + fn pop_segment(&self) -> Result<()> { + self.db.with_sqlite_tx(|c| { + if let Some(block_hash) = c.prepare_cached("SELECT block_hash FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? 
+ .query_row([], |row| row.get::<_,Hash>(0)).optional()? { + if let Some(parent_hash) = c.prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get(0)).optional()? { + + // update marker + c.prepare_cached( + "UPDATE sync_data SET peer = NULL WHERE block_hash = ?1")? + .execute( + [block_hash] + )?; + + // remove segment + let mut hashes = Vec::with_capacity(self.max_batch_size); + let mut block_hash = parent_hash; + while let Some(parent_hash) = c + .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get::<_, Hash>(0)) + .optional()? + { + hashes.push(block_hash); + block_hash = parent_hash; + } + + for hash in hashes { + c.prepare_cached("DELETE FROM sync_data WHERE block_hash = ?1")? + .execute([hash])?; + } + } + } Ok(()) }) } + /// Pushes a particular segment into the stack. fn push_segment(&self, peer: PeerInfo, meta: ChainMetaData) -> Result<()> { self.db.with_sqlite_tx(|c| { - c.execute( - "UPDATE sync_data SET peer = :peer WHERE block_hash = :block_hash", + c.prepare_cached( + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :peer)")? + .execute( named_params! { - ":peer": peer.peer_id.to_bytes(), + ":parent_hash": meta.parent_hash, ":block_hash": meta.block_hash, + ":block_number": meta.block_number, + ":view_number": meta.view_number, + ":peer": peer.peer_id.to_bytes(), }, )?; Ok(()) }) } - // TODO: Move into db.rs, optimise - fn insert_metadata(&mut self, meta: ChainMetaData) -> Result<()> { + /// Bulk inserts a bunch of metadata. + fn insert_metadata(&self, metas: Vec) -> Result<()> { self.db.with_sqlite_tx(|c| { - c.execute( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)", + for meta in metas { + c.prepare_cached( + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)")? + .execute( named_params! { ":parent_hash": meta.parent_hash, ":block_hash": meta.block_hash, @@ -265,14 +313,15 @@ impl Sync { ":view_number": meta.view_number, }, )?; + } Ok(()) }) } - // TODO: Move into db.rs, optimise - fn remove_metadata(&self, hash: Hash) -> Result<()> { + /// Empty the metadata table. + fn empty_metadata(&self) -> Result<()> { self.db.with_sqlite_tx(|c| { - c.execute("DELETE FROM sync_data WHERE block_hash = ?1", [hash])?; + c.execute("DELETE FROM sync_data", [])?; Ok(()) }) } @@ -336,15 +385,31 @@ impl Sync { tracing::info!( "sync::SyncProposal : finishing {} blocks for segment #{} from {}", self.recent_proposals.len(), - self.chain_segments.len(), + self.count_segments()?, self.peer_id, ); // inject the proposals let proposals = self.recent_proposals.drain(..).collect_vec(); self.inject_proposals(proposals)?; } + self.empty_metadata()?; self.state = SyncState::Phase0; } + // Retry to fix sync issues e.g. peers that are now offline + SyncState::Retry1 if self.in_pipeline == 0 => { + self.retry_phase1()?; + if self.started_at_block_number == 0 { + let highest_block = self + .db + .get_canonical_block_by_number( + self.db + .get_highest_canonical_block_number()? + .expect("no highest block"), + )? 
+ .expect("missing highest block"); + self.started_at_block_number = highest_block.number(); + } + } _ => { tracing::debug!( "sync::SyncProposal : syncing {} blocks in pipeline", @@ -388,8 +453,9 @@ impl Sync { /// /// Pop the most recently used segment from the segment marker, and retry phase 1. /// This will rebuild history from the previous marker, with another peer. + /// If this function is called many times, it will eventually restart from Phase 0. fn retry_phase1(&mut self) -> Result<()> { - if self.chain_segments.is_empty() { + if self.count_segments()? == 0 { tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain segments!"); self.state = SyncState::Phase0; return Ok(()); @@ -397,24 +463,14 @@ impl Sync { tracing::debug!( "sync::RetryPhase1 : retrying segment #{}", - self.chain_segments.len(), + self.count_segments()?, ); // remove the last segment from the chain metadata - let (_, meta) = self.chain_segments.pop().unwrap(); - self.pop_segment(meta.clone())?; - - let mut key = meta.parent_hash; - while let Some(p) = self.chain_metadata.remove(&key) { - key = p.parent_hash; - self.remove_metadata(p.block_hash)?; - } - - // retry from Phase 1 + let (meta, _) = self.last_segment()?.unwrap(); + self.pop_segment()?; self.state = SyncState::Phase1(meta); - if Self::DO_SPECULATIVE { - self.request_missing_metadata(None)?; - } + Ok(()) } @@ -445,7 +501,8 @@ impl Sync { // Empty response, downgrade peer and retry phase 1. tracing::warn!("sync::MultiBlockResponse : empty blocks {from}",); self.done_with_peer(DownGrade::Empty); - return self.retry_phase1(); + self.state = SyncState::Retry1; + return Ok(()); } else { self.done_with_peer(DownGrade::None); } @@ -453,7 +510,7 @@ impl Sync { tracing::info!( "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", response.len(), - self.chain_segments.len(), + self.count_segments()?, from ); @@ -473,7 +530,8 @@ impl Sync { tracing::error!( "sync::MultiBlockResponse : unexpected checksum={check_sum} != {checksum}" ); - return self.retry_phase1(); + self.state = SyncState::Retry1; + return Ok(()); } // Response seems sane. @@ -482,23 +540,11 @@ impl Sync { .sorted_by_key(|p| p.number()) .collect_vec(); - // Remove the blocks from the chain metadata - for p in &proposals { - if self.chain_metadata.remove(&p.hash()).is_none() { - anyhow::bail!("missing chain data for proposal"); // this should never happen! - } else { - self.remove_metadata(p.hash())?; - } - } - - // Done with this segment - if let Some((_, meta)) = self.chain_segments.pop() { - self.pop_segment(meta)?; - } + self.pop_segment()?; self.inject_proposals(proposals)?; // Done with phase 2 - if self.chain_segments.is_empty() { + if self.count_segments()? == 0 { self.state = SyncState::Phase3; } else if Self::DO_SPECULATIVE { // Speculatively request more blocks @@ -573,13 +619,8 @@ impl Sync { self.peers.push(peer); // If we have no chain_segments, we have nothing to do - if let Some((peer_info, meta)) = self.chain_segments.last() { - let mut request_hashes = Vec::with_capacity(self.max_batch_size); - let mut key = meta.parent_hash; // start from this block - while let Some(meta) = self.chain_metadata.get(&key) { - request_hashes.push(meta.block_hash); - key = meta.parent_hash; - } + if let Some((meta, peer_info)) = self.last_segment()? 
{ + let request_hashes = self.get_segment(meta.parent_hash)?; // Checksum of the request hashes let checksum = request_hashes @@ -594,7 +635,7 @@ impl Sync { tracing::info!( "sync::RequestMissingBlocks : requesting {} blocks of segment #{} from {}", request_hashes.len(), - self.chain_segments.len(), + self.count_segments()?, peer_info.peer_id, ); let message = match peer_info.version { @@ -700,7 +741,7 @@ impl Sync { .proposals .into_iter() // filter any blocks that are not in the chain e.g. forks - .filter(|p| self.chain_metadata.contains_key(&p.hash())) + .filter(|p| self.contains_metadata(&p.hash()).unwrap_or_default()) .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); @@ -786,8 +827,7 @@ impl Sync { let segment = response; // Record landmark, including peer that has this set of blocks - self.push_segment(segment_peer.clone(), meta.clone())?; - self.chain_segments.push((segment_peer, meta.clone())); + self.push_segment(segment_peer, meta.clone())?; // Record the oldest block in the chain's parent self.state = SyncState::Phase1(segment.last().cloned().unwrap()); @@ -796,22 +836,12 @@ impl Sync { tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", segment.len(), - self.chain_segments.len(), + self.count_segments()?, from ); - // Record the constructed chain metadata, check for loops - for meta in segment { - if self - .chain_metadata - .insert(meta.block_hash, meta.clone()) - .is_some() - { - anyhow::bail!("sync::MetadataResponse : loop in chain!"); // there is a possible loop in the chain - } else { - self.insert_metadata(meta)?; - } - } + // Record the constructed chain metadata + self.insert_metadata(segment)?; // If the segment hits our history, start Phase 2. if self.db.get_block_by_hash(&last_block_hash)?.is_some() { @@ -900,7 +930,7 @@ impl Sync { tracing::info!( "sync::RequestMissingMetadata : requesting {} metadata of segment #{} from {}", self.max_batch_size, - self.chain_segments.len() + 1, + self.count_segments()? + 1, peer.peer_id ); let message = match self.state { @@ -1007,12 +1037,12 @@ impl Sync { prop.from ); } - if let Some(p) = self.chain_metadata.remove(&prop.block.hash()) { - tracing::warn!( - "sync::MarkReceivedProposal : removing stale metadata {}", - p.block_hash - ); - } + // if let Some(p) = self.chain_metadata.remove(&prop.block.hash()) { + // tracing::warn!( + // "sync::MarkReceivedProposal : removing stale metadata {}", + // p.block_hash + // ); + // } self.in_pipeline = self.in_pipeline.saturating_sub(1); Ok(()) } @@ -1078,9 +1108,9 @@ impl Sync { )? .expect("missing highest block"); Ok(( - !self.recent_proposals.is_empty() - || !self.chain_metadata.is_empty() - || !self.chain_segments.is_empty(), + self.in_pipeline != 0 + || !self.recent_proposals.is_empty() + || self.count_segments()? != 0, highest_block, )) } @@ -1142,6 +1172,7 @@ enum SyncState { Phase1(ChainMetaData), Phase2(Hash), Phase3, + Retry1, } /// Peer Version From d95dd984ea67d459523170f1ca40e3e72dd2ac26 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 13 Jan 2025 18:27:07 +0800 Subject: [PATCH 065/119] feat: added PeerVer info to DB. 
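The version column stores the peer's wire version as a plain INTEGER, with FromSql/ToSql doing the enum mapping. A standalone round-trip of that mapping looks like the sketch below; the table, the values and the OutOfRange error choice are illustrative stand-ins, not the code added by this commit (which keeps the mapping inside sync.rs).

    // Sketch: storing a small enum as an INTEGER column and reading it back.
    use rusqlite::{
        params,
        types::{FromSql, FromSqlError, FromSqlResult, ToSql, ToSqlOutput, ValueRef},
        Connection,
    };

    #[derive(Debug, Clone, PartialEq)]
    enum PeerVer {
        V1 = 1,
        V2 = 2,
    }

    impl FromSql for PeerVer {
        fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
            match u32::column_result(value)? {
                1 => Ok(PeerVer::V1),
                2 => Ok(PeerVer::V2),
                other => Err(FromSqlError::OutOfRange(other as i64)),
            }
        }
    }

    impl ToSql for PeerVer {
        fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
            Ok((self.clone() as u32).into())
        }
    }

    fn main() -> rusqlite::Result<()> {
        let c = Connection::open_in_memory()?;
        c.execute_batch("CREATE TABLE t (version INTEGER)")?;
        c.execute("INSERT INTO t (version) VALUES (?1)", params![PeerVer::V2])?;
        let v: PeerVer = c.query_row("SELECT version FROM t", [], |row| row.get(0))?;
        assert_eq!(v, PeerVer::V2);
        Ok(())
    }
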
--- zilliqa/src/sync.rs | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 149f44e57..37d1ac287 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -9,7 +9,11 @@ use alloy::primitives::BlockNumber; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; -use rusqlite::{named_params, OptionalExtension}; +use rusqlite::{ + named_params, + types::{FromSql, FromSqlResult, ToSql, ToSqlOutput, ValueRef}, + OptionalExtension, +}; use crate::{ cfg::NodeConfig, @@ -132,7 +136,8 @@ impl Sync { parent_hash BLOB NOT NULL, block_number INTEGER NOT NULL PRIMARY KEY, view_number INTEGER NOT NULL, - peer BLOB DEFAULT NULL + peer BLOB DEFAULT NULL, + version INTEGER DEFAULT 0 ); CREATE INDEX IF NOT EXISTS idx_sync_data ON sync_data(block_number) WHERE peer IS NOT NULL;", )?; @@ -223,7 +228,7 @@ impl Sync { let mut result = None; self.db.with_sqlite_tx(|c| { result = c - .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, peer FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, peer, version FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? .query_row([], |row| Ok(( ChainMetaData{ parent_hash: row.get(0)?, @@ -234,7 +239,7 @@ impl Sync { PeerInfo { last_used: Instant::now(), score:u32::MAX, - version: PeerVer::V1, + version: row.get(5)?, peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), }, ))) @@ -285,7 +290,7 @@ impl Sync { fn push_segment(&self, peer: PeerInfo, meta: ChainMetaData) -> Result<()> { self.db.with_sqlite_tx(|c| { c.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :peer)")? + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, peer, version) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :peer, :version)")? .execute( named_params! { ":parent_hash": meta.parent_hash, @@ -293,6 +298,7 @@ impl Sync { ":block_number": meta.block_number, ":view_number": meta.view_number, ":peer": peer.peer_id.to_bytes(), + ":version": peer.version, }, )?; Ok(()) @@ -1178,6 +1184,22 @@ enum SyncState { /// Peer Version #[derive(Debug, Clone, Eq, PartialEq)] enum PeerVer { - V1, - V2, + V1 = 1, + V2 = 2, +} + +impl FromSql for PeerVer { + fn column_result(value: ValueRef) -> FromSqlResult { + u32::column_result(value).map(|i| match i { + 1 => PeerVer::V1, + 2 => PeerVer::V2, + _ => todo!("invalid version"), + }) + } +} + +impl ToSql for PeerVer { + fn to_sql(&self) -> Result { + Ok((self.clone() as u32).into()) + } } From f6b40958de54e4b02e5ea22b3ccddcc30d3ca8ed Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 14 Jan 2025 16:44:58 +0800 Subject: [PATCH 066/119] chore: post-rebase. 
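The commit after this one deletes block_store.rs outright. One detail worth a note for posterity is the set-associative key its BlockCache used: the top bits of the last byte of the sender's PeerId pick a "way" in the low bits of the key, while the view number occupies the high bits, so a single flooding peer cannot evict every cached copy of a view. The sketch below is a simplified, self-contained illustration of that packing; LOG2_WAYS is an assumed value (the real constant is constants::BLOCK_CACHE_LOG2_WAYS) and a u8 stands in for the PeerId byte.

    // Simplified illustration of the set-associative key from the soon-to-be
    // removed BlockCache: peer bits pick the way, the view fills the high bits.
    const LOG2_WAYS: usize = 2;          // assumed; real value lives in constants.rs
    const SHIFT: usize = 8 - LOG2_WAYS;  // mirrors `shift` in the deleted BlockCache

    fn key_from_view(peer_tag: u8, view: u64) -> u128 {
        u128::from(peer_tag >> SHIFT) | (u128::from(view) << SHIFT)
    }

    fn view_from_key(key: u128) -> u64 {
        (key >> SHIFT) as u64
    }

    fn main() {
        let key = key_from_view(0b1100_0001, 7);
        assert_eq!(view_from_key(key), 7);
        // Two peers whose tags differ in the top bits map the same view to
        // different ways, so one peer cannot evict every copy of that view.
        assert_ne!(key_from_view(0b0000_0000, 7), key_from_view(0b0100_0000, 7));
    }
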
--- zilliqa/src/block_store.rs | 2 +- zilliqa/src/message.rs | 2 +- zilliqa/src/sync.rs | 5 +---- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/zilliqa/src/block_store.rs b/zilliqa/src/block_store.rs index e17b60e96..0cf063564 100644 --- a/zilliqa/src/block_store.rs +++ b/zilliqa/src/block_store.rs @@ -19,7 +19,7 @@ use crate::{ constants, crypto::Hash, db::Db, - message::{Block, BlockRequest, BlockStrategy, ExternalMessage, Proposal}, + message::{Block, BlockStrategy, Proposal}, node::{MessageSender, OutgoingMessageFailure, RequestId}, range_map::RangeMap, time::SystemTime, diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index c9a758a31..690ad67c6 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -209,7 +209,7 @@ pub struct BlockRequest { pub to_view: u64, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct BlockResponse { pub proposals: Vec, pub from_view: u64, diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 37d1ac287..6b0348dfa 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -19,10 +19,7 @@ use crate::{ cfg::NodeConfig, crypto::Hash, db::Db, - message::{ - Block, BlockRequest, BlockRequestV2, BlockResponse, ChainMetaData, ExternalMessage, - InjectedProposal, Proposal, - }, + message::{Block, BlockRequest, BlockRequestV2, BlockResponse, ChainMetaData, ExternalMessage, InjectedProposal, Proposal}, node::MessageSender, time::SystemTime, }; From 8aae098b1239cad3abc36113188c6116e1899484 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 16 Jan 2025 10:57:05 +0800 Subject: [PATCH 067/119] feat: removed block_store.rs --- z2/src/converter.rs | 22 +- zilliqa/src/api/zilliqa.rs | 14 +- zilliqa/src/block_store.rs | 1023 ------------------------------------ zilliqa/src/consensus.rs | 201 ++----- zilliqa/src/db.rs | 2 +- zilliqa/src/exec.rs | 9 +- zilliqa/src/lib.rs | 1 - zilliqa/src/node.rs | 20 - zilliqa/src/pool.rs | 18 +- zilliqa/src/state.rs | 49 +- zilliqa/src/sync.rs | 42 +- 11 files changed, 99 insertions(+), 1302 deletions(-) delete mode 100644 zilliqa/src/block_store.rs diff --git a/z2/src/converter.rs b/z2/src/converter.rs index 251a0a324..dbb1445d1 100644 --- a/z2/src/converter.rs +++ b/z2/src/converter.rs @@ -14,18 +14,14 @@ use bitvec::{bitarr, order::Msb0}; use eth_trie::{EthTrie, MemoryDB, Trie}; use indicatif::{ProgressBar, ProgressFinish, ProgressIterator, ProgressStyle}; use itertools::Itertools; -use libp2p::PeerId; use sha2::{Digest, Sha256}; -use tokio::sync::mpsc; use tracing::{debug, trace, warn}; use zilliqa::{ - block_store::BlockStore, cfg::{scilla_ext_libs_path_default, Amount, Config, NodeConfig}, crypto::{Hash, SecretKey}, db::Db, exec::store_external_libraries, message::{Block, QuorumCertificate, Vote, MAX_COMMITTEE_SIZE}, - node::{MessageSender, RequestId}, schnorr, scilla::{storage_key, CheckOutput, ParamValue, Transition}, state::{Account, Code, ContractInit, State}, @@ -346,27 +342,15 @@ pub async fn convert_persistence( "{msg} {wide_bar} [{per_sec}] {human_pos}/~{human_len} ({elapsed}/~{duration})", )?; - let (outbound_message_sender, _a) = mpsc::unbounded_channel(); - let (local_message_sender, _b) = mpsc::unbounded_channel(); - let message_sender = MessageSender { - our_shard: 0, - our_peer_id: PeerId::random(), - outbound_channel: outbound_message_sender, - local_channel: local_message_sender, - request_id: RequestId::default(), - }; + // let (outbound_message_sender, _a) = mpsc::unbounded_channel(); + // let (local_message_sender, _b) = 
mpsc::unbounded_channel(); let zq2_db = Arc::new(zq2_db); let node_config = &zq2_config.nodes[0]; - let block_store = Arc::new(BlockStore::new( - node_config, - zq2_db.clone(), - message_sender.clone(), - )?); let mut state = State::new_with_genesis( zq2_db.clone().state_trie()?, node_config.clone(), - block_store, + zq2_db.clone(), )?; let mut scilla_docker = run_scilla_docker()?; diff --git a/zilliqa/src/api/zilliqa.rs b/zilliqa/src/api/zilliqa.rs index e587597cf..5d0cc44b4 100644 --- a/zilliqa/src/api/zilliqa.rs +++ b/zilliqa/src/api/zilliqa.rs @@ -508,7 +508,7 @@ fn get_blockchain_info(_: Params, node: &Arc>) -> Result>) -> Result>) -> Result block.transactions.len(), @@ -1247,11 +1245,7 @@ fn get_recent_transactions( let mut txns = Vec::new(); let mut blocks_searched = 0; while block_number > 0 && txns.len() < 100 && blocks_searched < 100 { - let block = match node - .consensus - .block_store - .get_canonical_block_by_number(block_number)? - { + let block = match node.consensus.get_canonical_block_by_number(block_number)? { Some(block) => block, None => continue, }; @@ -1274,7 +1268,7 @@ fn get_recent_transactions( // GetNumTransactions fn get_num_transactions(_params: Params, node: &Arc>) -> Result { let node = node.lock().unwrap(); - let num_transactions = node.consensus.block_store.get_num_transactions()?; + let num_transactions = node.consensus.get_num_transactions()?; Ok(num_transactions.to_string()) } @@ -1283,7 +1277,6 @@ fn get_num_txns_tx_epoch(_params: Params, node: &Arc>) -> Result block.transactions.len(), @@ -1302,7 +1295,6 @@ fn get_num_txns_ds_epoch(_params: Params, node: &Arc>) -> Result Self { - Self { - parent_hash, - from, - proposal, - } - } -} - -/// A block cache. -/// We need to be careful to conserve block space in the presence of block flooding attacks, and we need to -/// make sure we don't lose blocks that form part of the main chain repeatedly, else we will never be able -/// to construct it. -/// -/// Similarly, we should ensure that we always buffer proposals close to the head of the tree, else we will -/// lose sync frequently and have to request, which will slow down block production. -/// -/// An easy way to do this is to put a hash of the node address (actually, we just use the low bits) in the -/// bottom (log2(N_WAYS)) bits of the view number. We then evict the largest tag le (max_view - buffer). -/// -/// I don't think it actually matters whether we use the view or the block number here, since we're not using -/// fixed-size arrays. -/// -#[derive(Debug, Serialize, Deserialize)] -pub struct BlockCache { - /// Caches proposals that are not yet blocks, and are before the head_cache. - pub cache: BTreeMap, - /// Caches proposals close to the head. - /// This buys us out of the situation where we are, say, 2 blocks behind the head. - /// We request those blocks, but by the time we get them, a new block is proposed. - /// So we're now a block behind. We request it, and then, by the time we get it ... - /// and so on. The head_cache caches broadcast proposals at the head of the chain - /// so we only need to get to (head_of_chain - head_cache_entries) and we can - /// then catch up using the head cache. 
- pub head_cache: BTreeMap, - /// Caches ranges where we think there is no block at all (just an empty view) - pub empty_view_ranges: RangeMap, - /// The head cache - this caches - /// An index into the cache by parent hash - pub by_parent_hash: HashMap>, - /// Set associative shift - pub shift: usize, - /// This is used to count the number of times we've looked for a fork. - /// The counter is zeroed when we receive (or pop) a new block, and counts 1 every - /// time we looked. - pub fork_counter: usize, - /// Copied from the parent to minimise the number of additional parameters we need. - pub max_blocks_in_flight: u64, - /// These are views which we have removed from the cache to process later. Remember not to re-request them, or - /// we will end up asking peers for views which we are about to process. - /// We need to remember to clear these flags once we have the proposal, because it might be a lie and we may need - /// to rerequest in order to get the right view (there will only ever be one legitimate view with a given number, - /// but peers may lie to us about what it is) - pub views_expecting_proposals: BTreeSet, -} - -impl BlockCache { - pub fn new(max_blocks_in_flight: u64) -> Self { - Self { - cache: BTreeMap::new(), - head_cache: BTreeMap::new(), - empty_view_ranges: RangeMap::new(), - by_parent_hash: HashMap::new(), - shift: 8 - constants::BLOCK_CACHE_LOG2_WAYS, - fork_counter: 0, - max_blocks_in_flight, - views_expecting_proposals: BTreeSet::new(), - } - } - - pub fn key_from_view(&self, peer: &PeerId, view_num: u64) -> u128 { - let ways = peer.to_bytes().pop().unwrap_or(0x00); - u128::from(ways >> self.shift) | (u128::from(view_num) << self.shift) - } - - pub fn view_from_key(&self, key: u128) -> u64 { - u64::try_from(key >> self.shift).unwrap() - } - - pub fn min_key_for_view(&self, view: u64) -> u128 { - u128::from(view) << self.shift - } - - pub fn expect_process_proposal(&mut self, view: u64) { - self.views_expecting_proposals.insert(view); - } - - pub fn received_process_proposal(&mut self, view: u64) { - self.views_expecting_proposals.remove(&view); - } - - /// returns the minimum key (view << shift) that we are prepared to store in the head cache. - /// keys smaller than this are stored in the main cache. - /// We compute this by subtracting a constant from (highest_known_view +1)<< shift - which is - /// the highest key we think could currently exist (highest view we've ever seen +1 shifted up). - /// (the constant is preshifted for efficiency) - /// This aims to keep the head cache at roughly BLOCK_CACHE_HEAD_BUFFER_ENTRIES entries - /// (note that this will be BLOCK_CACHE_HEAD_BUFFER_ENTRIES >> shift cached views, since the - /// head cache is set associative) - pub fn min_head_cache_key(&self, highest_known_view: u64) -> u128 { - let delta = u128::try_from(constants::BLOCK_CACHE_HEAD_BUFFER_ENTRIES).unwrap(); - let highest_key = u128::from(highest_known_view + 1) << self.shift; - highest_key // prevent underflowing for low views - .saturating_sub(delta) - } - - pub fn destructive_proposals_from_parent_hashes( - &mut self, - hashes: &[Hash], - ) -> Vec<(PeerId, Proposal)> { - // For each hash, find the list of blocks that have it as the parent. 
- let cache_keys = hashes - .iter() - .filter_map(|x| self.by_parent_hash.remove(x)) - .flatten() - .collect::>(); - let maybe = cache_keys - .iter() - .filter_map(|key| { - self.cache - .remove(key) - .or_else(|| self.head_cache.remove(key)) - .map(|entry| (entry.from, entry.proposal)) - }) - .collect::>(); - if !cache_keys.is_empty() { - let max_view = - self.view_from_key(cache_keys.iter().fold(0, |v1, v2| std::cmp::max(v1, *v2))); - // Ignore any gaps up to this point, because they may be lies. - (_, self.empty_view_ranges) = - self.empty_view_ranges - .diff_inter(&RangeMap::from_range(&Range { - start: 0, - end: max_view + 1, - })); - // We got a real block! Reset the fork counter. - self.fork_counter = 0; - } - maybe - } - - /// Delete all blocks in the cache up to and including block_number - pub fn delete_blocks_up_to(&mut self, block_number: u64) { - // note that this code embodies the assumption that increasing block number implies - // increasing view number. - self.trim_with_fn(|_, v| -> bool { v.proposal.number() <= block_number }); - } - - pub fn trim(&mut self, highest_confirmed_view: u64) { - let lowest_ignored_key = self.min_key_for_view(highest_confirmed_view); - debug!("trim: lowest_ignored_key = {0}", lowest_ignored_key); - self.trim_with_fn(|k, _| -> bool { *k < lowest_ignored_key }); - // We don't care about anything lower than what we're about to flush - self.views_expecting_proposals = self - .views_expecting_proposals - .split_off(&highest_confirmed_view); - } - - /// DANGER WILL ROBINSON! This function only searches from the minimum key to the maximum, so - /// any selector function which is not monotonic in key will not work properly. - fn trim_with_fn bool>(&mut self, selector: F) { - // We've deleted or replaced this key with this parent hash; remove it from the index. - fn unlink_parent_hash(cache: &mut HashMap>, key: &u128, hash: &Hash) { - let mut do_remove = false; - if let Some(val) = cache.get_mut(hash) { - val.remove(key); - if val.is_empty() { - do_remove = true - } - } - if do_remove { - cache.remove(hash); - } - } - - let cache_entries = self.max_blocks_in_flight << constants::BLOCK_CACHE_LOG2_WAYS; - // debug!("trim: cache had: {0}", self.extant_block_ranges()?); - // Should really be an option, but given that there is a convenient sentinel.. - let mut lowest_view_in_cache: Option = None; - let shift = self.shift; - - for cache_ptr in [&mut self.cache, &mut self.head_cache] { - while let Some((k, v)) = cache_ptr.first_key_value() { - if selector(k, v) { - // Kill it! - if let Some((k, v)) = cache_ptr.pop_first() { - unlink_parent_hash(&mut self.by_parent_hash, &k, &v.parent_hash); - }; - } else { - let view_number = u64::try_from(*k >> shift).unwrap(); - lowest_view_in_cache = Some( - lowest_view_in_cache.map_or(view_number, |x| std::cmp::min(x, view_number)), - ); - break; - } - } - } - - // Empty view ranges below the thing we last trimmed might not exist - zap them. - if let Some(v) = lowest_view_in_cache { - (_, self.empty_view_ranges) = - self.empty_view_ranges - .diff_inter(&RangeMap::from_range(&Range { - start: 0, - end: v + 1, - })); - } - // And trim. 
- let cache_size = usize::try_from(cache_entries).unwrap(); - self.empty_view_ranges.truncate(cache_size); - - while self.head_cache.len() > constants::BLOCK_CACHE_HEAD_BUFFER_ENTRIES { - if let Some((k, v)) = self.head_cache.pop_first() { - // Push this into the main cache, otherwise we will get into the state where - // blocks are removed from the head cache and lost and we are constantly - // requesting blocks to replace them. - self.cache.insert(k, v); - } - } - while self.cache.len() > cache_size { - if let Some((k, v)) = self.cache.pop_last() { - unlink_parent_hash(&mut self.by_parent_hash, &k, &v.parent_hash); - } - } - // Both caches are now at most the "right" number of entries long. - } - - pub fn no_blocks_at(&mut self, no_blocks_in: &Range) { - self.empty_view_ranges.with_range(no_blocks_in); - } - - pub fn delete_empty_view_range_cache(&mut self) { - self.empty_view_ranges = RangeMap::new(); - } - - /// Insert this proposal into the cache. - pub fn insert( - &mut self, - from: &PeerId, - parent_hash: &Hash, - proposal: Proposal, - highest_confirmed_view: u64, - highest_known_view: u64, - ) -> Result<()> { - fn insert_with_replacement( - into: &mut BTreeMap, - by_parent_hash: &mut HashMap>, - from: &PeerId, - parent_hash: &Hash, - key: u128, - value: Proposal, - ) { - into.insert(key, BlockCacheEntry::new(*parent_hash, *from, value)) - .map(|entry| { - by_parent_hash - .get_mut(&entry.parent_hash) - .map(|x| x.remove(&key)) - }); - if let Some(v) = by_parent_hash.get_mut(parent_hash) { - v.insert(key); - } else { - let mut new_set = HashSet::new(); - new_set.insert(key); - by_parent_hash.insert(*parent_hash, new_set); - } - } - - if proposal.header.view <= highest_confirmed_view { - // nothing to do. - return Ok(()); - } - // First, insert us. - let key = self.key_from_view(from, proposal.header.view); - if key > self.min_head_cache_key(highest_known_view) { - insert_with_replacement( - &mut self.head_cache, - &mut self.by_parent_hash, - from, - parent_hash, - key, - proposal, - ); - } else { - insert_with_replacement( - &mut self.cache, - &mut self.by_parent_hash, - from, - parent_hash, - key, - proposal, - ); - } - // Zero the fork counter. - self.fork_counter = 0; - // Now evict the worst entry - self.trim(highest_confirmed_view); - Ok(()) - } - - pub fn inc_fork_counter(&mut self) -> usize { - self.fork_counter += 1; - self.fork_counter - } - - pub fn reset_fork_counter(&mut self) { - self.fork_counter = 0; - } - - // For debugging - what view number ranges are in the cache? - pub fn extant_block_ranges(&self) -> RangeMap { - let mut result = RangeMap::new(); - let shift = 8 - constants::BLOCK_CACHE_LOG2_WAYS; - for key in self.cache.keys() { - let _ = u128::try_into(key >> shift).map(|x| result.with_elem(x)); - } - for key in self.head_cache.keys() { - let _ = u128::try_into(key >> shift).map(|x| result.with_elem(x)); - } - result - } - - pub fn expectant_block_ranges(&self) -> RangeMap { - let mut result = RangeMap::new(); - self.views_expecting_proposals.iter().for_each(|v| { - result.with_elem(*v); - }); - result - } -} - -/// Stores and manages the node's list of blocks. Also responsible for making requests for new blocks. -/// -/// # Syncing Algorithm -/// -/// We rely on [crate::consensus::Consensus] informing us of newly received block proposals via: -/// * [BlockStore::process_block] for blocks that can be part of our chain, because we already have their parent. -/// * [BlockStore::buffer_proposal] for blocks that can't (yet) be part of our chain. 
-/// -/// Both these code paths also call [BlockStore::request_missing_blocks]. This finds the greatest view of any proposal -/// we've seen (whether its part of our chain or not). -/// -/// -/// TODO(#1096): Retries for blocks we request but never receive. -#[derive(Debug)] -pub struct BlockStore { - db: Arc, - block_cache: Arc>>, - /// The maximum view of any proposal we have received, even if it is not part of our chain yet. - highest_known_view: u64, - /// Highest confirmed view - blocks we know to be correct. - highest_confirmed_view: u64, - /// Information we keep about our peers' state. - peers: BTreeMap, - /// The maximum number of blocks to send requests for at a time. - max_blocks_in_flight: u64, - /// Our block strategies. - strategies: Vec, - /// The block views we have available. This is read once from the DB at start-up and incrementally updated whenever - /// we receive a new block. We do this because obtaining the data from the DB is expensive. - available_blocks: RangeMap, - - /// Buffered block proposals. - buffered: BlockCache, - /// Requests we would like to send, but haven't been able to (e.g. because we have no peers). - unserviceable_requests: Option, - message_sender: MessageSender, - - /// Where we last started syncing, so we can report it in get_sync_data() - started_syncing_at: BlockNumber, - /// Previous sync flag, so we can tell when it changes. - last_sync_flag: bool, -} - -/// Data about block availability sent between peers -#[derive(Clone, Debug, Serialize, Deserialize)] -struct BlockAvailability { - /// None means no information, Some([]) means the other node shouldn't be relied upon for any blocks at all. - strategies: Option>, - /// The largest view we've seen from a block that this peer sent us. - highest_known_view: u64, -} - -#[derive(Clone, Debug)] -struct PeerInfo { - /// Availability from this peer - availability: BlockAvailability, - /// When did we last update availability? - availability_updated_at: Option, - /// Requests we've sent to the peer. - pending_requests: HashMap, - /// If `Some`, the time of the most recently failed request. - last_request_failed_at: Option, -} - -impl PeerInfo { - fn new() -> Self { - Self { - availability: BlockAvailability::new(), - availability_updated_at: None, - pending_requests: HashMap::new(), - last_request_failed_at: None, - } - } -} - -/// Data about a peer -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct PeerInfoStatus { - availability: BlockAvailability, - availability_updated_at: Option, - pending_requests: Vec<(String, SystemTime, u64, u64)>, - last_request_failed_at: Option, -} - -/// Data about the block store, used for debugging. -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct BlockStoreStatus { - highest_known_view: u64, - views_held: Vec>, - peers: Vec<(String, PeerInfoStatus)>, - availability: Option>, -} - -impl BlockStoreStatus { - pub fn new(block_store: &mut BlockStore) -> Result { - let peers = block_store - .peers - .iter() - .map(|(k, v)| (format!("{:?}", k), PeerInfoStatus::new(v))) - .collect::>(); - Ok(Self { - highest_known_view: block_store.highest_known_view, - views_held: block_store.db.get_view_ranges()?, - peers, - availability: block_store.availability()?, - }) - } -} - -impl PeerInfoStatus { - // Annoyingly, this can't (easily) be allowed to fail without making generating debug info hard. 
- fn new(info: &PeerInfo) -> Self { - fn s_from_time(q: Option) -> Option { - q.map(|z| { - z.duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or(Duration::ZERO) - .as_secs() - }) - } - let pending_requests = info - .pending_requests - .iter() - .map(|(k, v)| (format!("{:?}", k), v.0, v.1, v.2)) - .collect::>(); - Self { - availability: info.availability.clone(), - availability_updated_at: s_from_time(info.availability_updated_at), - pending_requests, - last_request_failed_at: s_from_time(info.last_request_failed_at), - } - } -} - -impl BlockAvailability { - pub fn new() -> Self { - Self { - strategies: None, - highest_known_view: 0, - } - } -} - -impl BlockStore { - pub fn new(config: &NodeConfig, db: Arc, message_sender: MessageSender) -> Result { - let available_blocks = - db.get_view_ranges()? - .iter() - .fold(RangeMap::new(), |mut range_map, range| { - range_map.with_range(range); - range_map - }); - Ok(BlockStore { - db, - block_cache: Arc::new(RwLock::new(LruCache::new(NonZeroUsize::new(5).unwrap()))), - highest_known_view: 0, - highest_confirmed_view: 0, - peers: BTreeMap::new(), - max_blocks_in_flight: config.max_blocks_in_flight as u64, - strategies: vec![BlockStrategy::Latest(constants::RETAINS_LAST_N_BLOCKS)], - available_blocks, - buffered: BlockCache::new(config.max_blocks_in_flight as u64), - unserviceable_requests: None, - message_sender, - started_syncing_at: 0, - last_sync_flag: false, - }) - } - - /// The data set here is held in memory. It can be useful to update manually - /// For example after a restart to remind block_store of its peers and height - pub fn set_peers_and_view( - &mut self, - highest_known_view: u64, - peer_ids: &Vec, - ) -> Result<()> { - for peer_id in peer_ids { - self.peer_info(*peer_id); - } - self.highest_known_view = highest_known_view; - Ok(()) - } - - /// Create a read-only clone of this [BlockStore]. The read-only property must be upheld by the caller - Calling - /// any `&mut self` methods on the returned [BlockStore] will lead to problems. This clone is cheap. - pub fn clone_read_only(&self) -> Arc { - Arc::new(BlockStore { - db: self.db.clone(), - block_cache: self.block_cache.clone(), - highest_known_view: 0, - highest_confirmed_view: 0, - peers: BTreeMap::new(), - max_blocks_in_flight: 0, - strategies: self.strategies.clone(), - available_blocks: RangeMap::new(), - buffered: BlockCache::new(0), - unserviceable_requests: None, - message_sender: self.message_sender.clone(), - started_syncing_at: self.started_syncing_at, - last_sync_flag: self.last_sync_flag, - }) - } - - /// Update someone else's availability - pub fn update_availability( - &mut self, - from: PeerId, - avail: &Option>, - ) -> Result<()> { - let the_peer = self.peer_info(from); - the_peer.availability.strategies.clone_from(avail); - the_peer.availability_updated_at = Some(SystemTime::now()); - Ok(()) - } - - /// Retrieve our availability. - /// We need to do this by view range, which means that we need to account for views where there was no block. - /// So, the underlying db function finds the view lower and upper bounds of our contiguous block ranges and we - /// advertise those. - pub fn availability(&self) -> Result>> { - let mut to_return = self.strategies.clone(); - to_return.extend( - self.available_blocks - .ranges - .iter() - .map(|range| BlockStrategy::CachedViewRange(range.clone(), None)), - ); - Ok(Some(to_return)) - } - - /// Buffer a block proposal whose parent we don't yet know about. 
- pub fn buffer_proposal(&mut self, from: PeerId, proposal: Proposal) -> Result<()> { - let view = proposal.view(); - - // If this is the highest block we've seen, remember its view. - if view > self.highest_known_view { - trace!(view, "block_store:: new highest known view"); - self.highest_known_view = view; - } - - trace!( - "block_store:: buffer_proposal {view}, hc {0}", - self.highest_confirmed_view - ); - self.buffered.insert( - &from, - &proposal.header.qc.block_hash.clone(), - proposal, - self.highest_confirmed_view, - self.highest_known_view, - )?; - - let peer = self.peer_info(from); - if view > peer.availability.highest_known_view { - trace!(%from, view, "block_store:: new highest known view for peer"); - peer.availability.highest_known_view = view; - } - - Ok(()) - } - - /// This function: - /// - /// * Looks through the blocks we have - /// * Finds the next blocks it thinks we need - /// * Iterates through our known peers. - /// - /// If we don't have availability for a peer, we will request it by - /// sending an empty block request. - /// - /// If we do, we will try to request whatever blocks it has that we want. - /// - /// We limit the number of outstanding requests per peer, in order to - /// avoid bufferbloat at the peer's input message queue. - /// - /// We don't ask for blocks that we think are in flight (ie. we've - /// requested them but they have not yet arrived), those we don't think a - /// peer has, or those we think are gaps (remember that requests are made - /// by view, so you can't guarantee that every view has a block). - /// - /// We time out outstanding requests on a flat-timeout basis (our model - /// being that if you haven't replied by now, the whole message has - /// probably been lost). - /// Returns whether this function thinks we are syncing or not. - pub fn request_missing_blocks(&mut self) -> Result { - // Get the highest view we currently have committed to our chain. - // This is a bit horrid - it can go down as well as up, because we can roll back blocks - // when we discover that they are ahead of what we think the rest of the chain - // has committed to - if we don't roll back here, we won't then fetch the canonical - // versions of those blocks (thinking we already have them). - let (syncing, current_block) = self.am_syncing()?; - self.highest_confirmed_view = current_block.view(); - let current_view = current_block.view(); - trace!( - "block_store::request_missing_blocks() : set highest_confirmed_view {0} (current = {1})", - self.highest_confirmed_view, - current_view, - ); - - // First off, let's load up the unserviceable requests. - let mut to_request = if let Some(us_requests) = self.unserviceable_requests.take() { - us_requests - } else { - RangeMap::new() - }; - - // If we think the network might be ahead of where we currently are, attempt to download the missing blocks. - // This is complicated, because we mustn't request more blocks than will fit in our cache, or we might - // end up evicting the critical part of the chain.. - // @todo I can't think of a more elegant way than this, but it's horrid - we want to exclude views which - // we might still be voting on. - if syncing { - trace!( - current_view, - self.highest_known_view, - self.max_blocks_in_flight, - "block_store::request_missing_blocks() : missing some blocks" - ); - { - // We need to request from current_view, because these blocks might never be returned by our peers - // deduplication of requests is done one level lower - in request_blocks(). 
- let from = current_view + 1; - // Never request more than current_view + max_blocks_in_flight, or the cache won't be able to hold - // the responses and we'll end up being unable to reconstruct the chain. Not strictly true, because - // the network will hold some blocks for us, but true enough that I think we ought to treat it as - // such. - let to = cmp::min( - current_view + self.max_blocks_in_flight, - self.highest_known_view, - ); - trace!("block_store::request_missing_blocks() : requesting blocks {from} to {to}"); - to_request.with_range(&Range { - start: from, - end: to + 1, - }); - } - if !to_request.is_empty() { - self.request_blocks(&to_request)?; - } - } else { - // We're synced - no need to try and guess forks. - trace!( - "block_store::request_missing_blocks() : synced with highest_known_view {0}, current_view {1}", - self.highest_known_view, - current_view - ); - self.buffered.reset_fork_counter(); - } - - if syncing && !self.last_sync_flag { - // We didn't used to be syncing; remember when we started. - self.started_syncing_at = current_block.number(); - } - self.last_sync_flag = syncing; - - Ok(syncing) - } - - pub fn prune_pending_requests(&mut self) -> Result<()> { - // In the good old days, we could've done this by linear interpolation on the timestamp. - let current_time = SystemTime::now(); - for peer in self.peers.keys().cloned().collect::>() { - let the_peer = self.peer_info(peer); - the_peer.pending_requests = the_peer - .pending_requests - .iter() - .filter_map(|(k, (v1, v2, v3))| { - // How long since this request was sent? - match current_time.duration_since(*v1) { - Ok(since) => { - if since > constants::BLOCK_REQUEST_RESPONSE_TIMEOUT { - // Time out everything. - trace!("block_store::prune_pending_requests: timing out pending request {k:?} {v1:?} {v2} {v3}"); - None - } else { - Some((*k, (*v1, *v2, *v3))) - } - } - _ => None, - } - }) - .collect(); - } - Ok(()) - } - - pub fn retry_us_requests(&mut self) -> Result<()> { - if let Some(us_requests) = self.unserviceable_requests.take() { - self.request_blocks(&us_requests)?; - } - Ok(()) - } - - /// Make a request for the blocks associated with a range of views. Returns `true` if a request was made and `false` if the request had to be - /// buffered because no peers were available. - /// Public so we can trigger it from the debug API - pub fn request_blocks(&mut self, _req: &RangeMap) -> Result { - Ok(false) // FIXME: Stub - } - - pub fn get_block(&self, hash: Hash) -> Result> { - let mut block_cache = self - .block_cache - .write() - .map_err(|e| anyhow!("Failed to get write access to block cache: {e}"))?; - if let Some(block) = block_cache.get(&hash) { - return Ok(Some(block.clone())); - } - let Some(block) = self.db.get_block_by_hash(&hash)? else { - return Ok(None); - }; - block_cache.put(hash, block.clone()); - Ok(Some(block)) - } - - pub fn get_block_by_view(&self, view: u64) -> Result> { - let Some(hash) = self.db.get_block_hash_by_view(view)? else { - return Ok(None); - }; - self.get_block(hash) - } - - pub fn get_highest_canonical_block_number(&self) -> Result> { - self.db.get_highest_canonical_block_number() - } - - pub fn get_canonical_block_by_number(&self, number: u64) -> Result> { - self.db.get_canonical_block_by_number(number) - } - - /// Called to process a block which can be added to the chain. 
- /// - insert the block into any necessary databases - /// - update the highest known and confirmed views, if necessary, - /// - Return a list of proposals that can now be made part of the chain, removing - /// them from the cache to free up space as we do so. - /// - /// The caller should arrange to process the returned list asynchronously to avoid - /// blocking message processing for too long. - pub fn process_block( - &mut self, - from: Option, - block: Block, - ) -> Result> { - trace!(?from, number = block.number(), hash = ?block.hash(), "block_store::process_block() : starting"); - self.db.insert_block(&block)?; - self.available_blocks.with_elem(block.view()); - - if let Some(from) = from { - let peer = self.peer_info(from); - if block.view() > peer.availability.highest_known_view { - trace!(%from, view = block.view(), "new highest known view for peer"); - peer.availability.highest_known_view = block.view(); - } - } - - // There are two sets - let result = self - .buffered - .destructive_proposals_from_parent_hashes(&[block.hash()]); - - // Update highest_confirmed_view, but don't trim the cache if - // we're not changing anything. - if block.header.view > self.highest_confirmed_view { - self.highest_confirmed_view = block.header.view; - self.buffered.trim(self.highest_confirmed_view); - } - - Ok(result) - } - - pub fn report_outgoing_message_failure( - &mut self, - failure: OutgoingMessageFailure, - ) -> Result<()> { - let peer_info = self.peer_info(failure.peer); - let Some((_, from, to)) = peer_info.pending_requests.remove(&failure.request_id) else { - // A request we didn't know about failed. It must have been sent by someone else. - return Ok(()); - }; - peer_info.last_request_failed_at = Some(SystemTime::now()); - - trace!("block_store : outgoing_message_failure: re-requesting {from} - {to}"); - self.request_blocks(&RangeMap::from_closed_interval(from, to))?; - - Ok(()) - } - - fn peer_info(&mut self, peer: PeerId) -> &mut PeerInfo { - self.peers.entry(peer).or_insert_with(PeerInfo::new) - } - - pub fn forget_block_range(&mut self, blocks: Range) -> Result<()> { - self.db.forget_block_range(blocks) - } - - pub fn contains_block(&mut self, block_hash: &Hash) -> Result { - self.db.contains_block(block_hash) - } - - // Retrieve the plausible next blocks for the block with this hash - // Because of forks there might be many of these. - pub fn obtain_child_block_candidates_for( - &mut self, - hashes: &[Hash], - ) -> Result> { - // The easy case is that there's something in the buffer with us as its parent hash. - let with_parent_hashes = self - .buffered - .destructive_proposals_from_parent_hashes(hashes); - if with_parent_hashes.is_empty() { - // There isn't. There are three cases: - // - // 1. We simply haven't received the next block yet. Give up and wait for it. - // 2. We have received a lie for the next block. Delete it and try again. - // 3. There was a fork and so the true next block is a bit further on in the - // chain than we've looked so far. - // - // There would be a few easy optimisations if we could eg. assume that forks were max length - // 1. As it is, I can't think of a clever way to do this, so... - - // In any case, deleting any cached block that calls itself the next block is - // the right thing to do - if it really was the next block, we would not be - // executing this branch. - if let Some(highest_block_number) = self.db.get_highest_canonical_block_number()? 
{ - self.buffered.delete_blocks_up_to(highest_block_number + 1); - trace!( - "block_store::obtain_child_block_candidates : deleted cached blocks up to and including {0}", - highest_block_number + 1 - ); - } - - let fork_elems = - self.buffered.inc_fork_counter() * (1 + constants::EXAMINE_BLOCKS_PER_FORK_COUNT); - - // Limit the number of forks to process otherwise the db query can take too long - const MAX_FORK_BLOCKS_TO_QUERY: usize = 512; - let fork_elems = cmp::min(fork_elems, MAX_FORK_BLOCKS_TO_QUERY); - - let parent_hashes = self.db.get_highest_block_hashes(fork_elems)?; - let revised = self - .buffered - .destructive_proposals_from_parent_hashes(&parent_hashes); - if !revised.is_empty() { - // Found some! - self.buffered.reset_fork_counter(); - } - Ok(revised) - } else { - Ok(with_parent_hashes) - } - } - - pub fn next_proposals_if_likely(&mut self) -> Result> { - // This is a bit sneaky, but the db overhead is just stepping through its B-Tree and this - // lets us cut out a lot of forks with 0 retries. - self.obtain_child_block_candidates_for( - &self - .db - .get_highest_block_hashes(constants::EXAMINE_BLOCKS_PER_FORK_COUNT)?, - ) - } - - pub fn delete_empty_view_range_cache(&mut self) { - self.buffered.delete_empty_view_range_cache(); - } - - /// Suppose that there is a view with no associated block. - /// Because we request views, not blocks, we will ask for blocks for those views. - /// Because there are no valid blocks in those views, we won't get them. - /// We will therefore ask again, and continue doing so forever, potentially exhausting our capacity for outstanding - /// view requests and blocking us from requesting blocks from views in which they might be extant. - /// We avoid this by finding the gaps between the view numbers of proposals we receive and caching - /// this list in the block_cache. We then arrange not to rerequest blocks in views for which we know there are no - /// valid blocks - remembering to clear this periodically in case a malicious node has lied to us about it. - /// - /// this function takes a list of proposals in a block response, works out the gaps between them and caches - /// the result. Gaps at the beginning of the sequence are recorded in the space between from_view and the view of the - /// first proposal; gaps at the end are ignored (and will be returned when we ask for the next view up from where - /// this block proposal left off). - pub fn buffer_lack_of_proposals( - &mut self, - from_view: u64, - proposals: &Vec, - ) -> Result<()> { - // OK. Find the gaps and register them as areas not to ask about again, because - // we now "know" that there is no block in this range. - // If this turns out to be a lie, we will pop the first block in the gap and check to see - // if it our next block. This will have the side-effect of forgetting about any gaps before - // that point, which we will then re-query, realise our mistake and carry on. - // @todo this is horribly slow - speed it up! - let mut gap_start = from_view; - let mut gap_end; - for p in proposals { - gap_end = p.header.view; - if gap_end > gap_start { - self.buffered.no_blocks_at(&Range { - start: gap_start, - end: gap_end, - }); - } - gap_start = gap_end + 1; - } - // There's never a gap at the end, because we don't know at which view we stopped. 
- Ok(()) - } - - pub fn get_num_transactions(&self) -> Result { - let count = self.db.get_total_transaction_count()?; - Ok(count) - } - - pub fn summarise_buffered(&self) -> RangeMap { - self.buffered.extant_block_ranges() - } - - pub fn expect_process_proposal(&mut self, view: u64) { - self.buffered.expect_process_proposal(view); - } - - pub fn received_process_proposal(&mut self, view: u64) { - self.buffered.received_process_proposal(view); - } - - /// Returns (am_syncing, current_highest_block) - pub fn am_syncing(&self) -> Result<(bool, Block)> { - let current_block = self - .db - .get_canonical_block_by_number( - self.db - .get_highest_canonical_block_number()? - .ok_or_else(|| anyhow!("no highest block"))?, - )? - .ok_or_else(|| anyhow!("missing highest block"))?; - Ok(( - (self.highest_known_view + 2) > current_block.view(), - current_block, - )) - } - - // Returns (starting_block, current_block, highest_block) if we're syncing, - // None if we're not. - pub fn get_sync_data(&self) -> Result> { - let (flag, highest_block) = self.am_syncing()?; - if !flag { - Ok(None) - } else { - // Compute the highest block. We're going to do this by taking the difference between - - // get an estimated block number if no more views were skipped. - let skipped_views = highest_block.view() - highest_block.number(); - let expected_highest_block_number = self.highest_known_view - skipped_views; - Ok(Some(( - self.started_syncing_at, - highest_block.number(), - expected_highest_block_number, - ))) - } - } -} diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index cca2c722e..674b20d29 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -20,7 +20,6 @@ use tokio::sync::{broadcast, mpsc::UnboundedSender}; use tracing::*; use crate::{ - block_store::BlockStore, blockhooks, cfg::{ConsensusConfig, NodeConfig}, constants::TIME_TO_ALLOW_PROPOSAL_BROADCAST, @@ -31,12 +30,11 @@ use crate::{ inspector::{self, ScillaInspector, TouchedAddressInspector}, message::{ AggregateQc, BitArray, BitSlice, Block, BlockHeader, BlockRef, BlockStrategy, - ExternalMessage, InternalMessage, NewView, ProcessProposal, Proposal, QuorumCertificate, - Vote, MAX_COMMITTEE_SIZE, + ExternalMessage, InternalMessage, NewView, Proposal, QuorumCertificate, Vote, + MAX_COMMITTEE_SIZE, }, node::{MessageSender, NetworkMessage, OutgoingMessageFailure}, pool::{TransactionPool, TxAddResult, TxPoolContent}, - range_map::RangeMap, state::State, sync::Sync, time::SystemTime, @@ -153,7 +151,6 @@ pub struct Consensus { message_sender: MessageSender, reset_timeout: UnboundedSender, pub sync: Sync, - pub block_store: BlockStore, latest_leader_cache: RefCell>, votes: BTreeMap, /// Votes for a block we don't have stored. They are retained in case we receive the block later. @@ -210,18 +207,16 @@ impl Consensus { let sync = Sync::new(&config, db.clone(), message_sender.clone(), Vec::new())?; - // It is important to create the `BlockStore` after the checkpoint has been loaded into the DB. The - // `BlockStore` pre-loads and caches information about the currently stored blocks. - let block_store = BlockStore::new(&config, db.clone(), message_sender.clone())?; - let latest_block = db .get_finalized_view()? - .map(|view| { - block_store - .get_block_by_view(view)? 
- .ok_or_else(|| anyhow!("no header found at view {view}")) + .and_then(|view| { + db.get_block_hash_by_view(view) + .expect("no header found at view {view}") }) - .transpose()?; + .and_then(|hash| { + db.get_block_by_hash(&hash) + .expect("no block found for hash {hash}") + }); let mut state = if let Some(latest_block) = &latest_block { trace!("Loading state from latest block"); @@ -229,15 +224,11 @@ impl Consensus { db.state_trie()?, latest_block.state_root_hash().into(), config.clone(), - block_store.clone_read_only(), + db.clone(), ) } else { trace!("Constructing new state from genesis"); - State::new_with_genesis( - db.state_trie()?, - config.clone(), - block_store.clone_read_only(), - )? + State::new_with_genesis(db.state_trie()?, config.clone(), db.clone())? }; let (latest_block, latest_block_view) = match latest_block { @@ -251,10 +242,9 @@ impl Consensus { let (start_view, finalized_view, high_qc) = { match db.get_high_qc()? { Some(qc) => { - let high_block = block_store - .get_block(qc.block_hash)? + let high_block = db + .get_block_by_hash(&qc.block_hash)? .ok_or_else(|| anyhow!("missing block that high QC points to!"))?; - let finalized_number = db .get_finalized_view()? .ok_or_else(|| anyhow!("missing latest finalized view!"))?; @@ -291,8 +281,7 @@ impl Consensus { let highest_block_number = db .get_highest_canonical_block_number()? .ok_or_else(|| anyhow!("can't find highest block num in database!"))?; - - let head_block = block_store + let head_block = db .get_canonical_block_by_number(highest_block_number)? .ok_or_else(|| anyhow!("missing head block!"))?; trace!( @@ -329,7 +318,6 @@ impl Consensus { secret_key, config, sync, - block_store, latest_leader_cache: RefCell::new(None), message_sender, reset_timeout, @@ -401,8 +389,8 @@ impl Consensus { // Remind block_store of our peers and request any potentially missing blocks let high_block = consensus - .block_store - .get_block(high_qc.block_hash)? + .db + .get_block_by_hash(&high_qc.block_hash)? .ok_or_else(|| anyhow!("missing block that high QC points to!"))?; let executed_block = BlockHeader { @@ -413,24 +401,15 @@ impl Consensus { // Grab last seen committee's peerIds in case others also went offline let committee = state_at.get_stakers(executed_block)?; - let recent_peer_ids: Vec<_> = committee + let recent_peer_ids = committee .iter() .filter(|&&peer_public_key| peer_public_key != consensus.public_key()) .filter_map(|&peer_public_key| { state_at.get_peer_id(peer_public_key).unwrap_or(None) }) - .collect(); + .collect_vec(); - consensus - .block_store - .set_peers_and_view(high_block.view(), &recent_peer_ids)?; - // It is likley that we missed the most recent proposal. Request it now - consensus - .block_store - .request_blocks(&RangeMap::from_closed_interval( - high_block.view(), - high_block.view() + 1, - ))?; + consensus.sync.add_peers(recent_peer_ids); } Ok(consensus) @@ -463,11 +442,11 @@ impl Consensus { pub fn head_block(&self) -> Block { let highest_block_number = self - .block_store + .db .get_highest_canonical_block_number() .unwrap() .unwrap(); - self.block_store + self.db .get_canonical_block_by_number(highest_block_number) .unwrap() .unwrap() @@ -652,7 +631,7 @@ impl Consensus { // FIXME: Cleanup - if self.block_store.contains_block(&block.hash())? { + if self.db.contains_block(&block.hash())? 
{ trace!("ignoring block proposal, block store contains this block already"); return Ok(None); } @@ -678,29 +657,11 @@ impl Consensus { return Ok(None); } - match self.check_block(&block, during_sync) { - Ok(()) => {} - Err((e, temporary)) => { - // If this block could become valid in the future, buffer it. - if temporary { - self.block_store.buffer_proposal( - from, - Proposal::from_parts_with_hashes( - block, - transactions - .into_iter() - .map(|tx| { - let hash = tx.calculate_hash(); - (tx, hash) - }) - .collect(), - ), - )?; - } else { - warn!(?e, "invalid block proposal received!"); - } - return Ok(None); + if let Err((e, temporary)) = self.check_block(&block, during_sync) { + if !temporary { + warn!(?e, "invalid block proposal received!"); } + return Ok(None); } self.update_high_qc_and_view(block.agg.is_some(), block.header.qc)?; @@ -723,19 +684,6 @@ impl Consensus { block.view(), view ); - self.block_store.buffer_proposal( - from, - Proposal::from_parts_with_hashes( - block, - transactions - .into_iter() - .map(|tx| { - let hash = tx.calculate_hash(); - (tx, hash) - }) - .collect(), - ), - )?; return Ok(None); } @@ -1990,7 +1938,7 @@ impl Consensus { new_high_qc: QuorumCertificate, ) -> Result<()> { let view = self.get_view()?; - let Some(new_high_qc_block) = self.block_store.get_block(new_high_qc.block_hash)? else { + let Some(new_high_qc_block) = self.db.get_block_by_hash(&new_high_qc.block_hash)? else { // We don't set high_qc to a qc if we don't have its block. warn!("Recieved potential high QC but didn't have the corresponding block"); return Ok(()); @@ -2441,11 +2389,10 @@ impl Consensus { pub fn receive_block_availability( &mut self, from: PeerId, - availability: &Option>, + _availability: &Option>, ) -> Result<()> { trace!("Received block availability from {:?}", from); - self.block_store.update_availability(from, availability)?; - Ok(()) + Ok(()) // FIXME: Stub } // Checks for the validity of a block and adds it to our block store if valid. @@ -2460,8 +2407,6 @@ impl Consensus { proposal.number(), proposal.view() ); - self.block_store - .received_process_proposal(proposal.header.view); let result = self.proposal(from, proposal, true)?; // Processing the received block can either result in: // * A `Proposal`, if we have buffered votes for this block which form a supermajority, meaning we can @@ -2477,25 +2422,7 @@ impl Consensus { let hash = block.hash(); debug!(?from, ?hash, ?block.header.view, ?block.header.number, "added block"); let _ = self.new_blocks.send(block.header); - // We may have child blocks; process them too. - self.block_store - .process_block(from, block)? - .into_iter() - .try_for_each(|(from_id, child_proposal)| -> Result<()> { - // The only reason this can fail is permanent failure of the messaging mechanism, so - // propagate it back here. 
- // Mark this block in the cache as "we're about to process this one" - let view = child_proposal.header.view; - self.message_sender.send_external_message( - self.peer_id(), - ExternalMessage::ProcessProposal(ProcessProposal { - from: from_id.to_bytes(), - block: child_proposal, - }), - )?; - self.block_store.expect_process_proposal(view); - Ok(()) - })?; + self.db.insert_block(&block)?; Ok(()) } @@ -2536,15 +2463,15 @@ impl Consensus { } pub fn get_block(&self, key: &Hash) -> Result> { - self.block_store.get_block(*key) + self.db.get_block_by_hash(key) } pub fn get_block_by_view(&self, view: u64) -> Result> { - self.block_store.get_block_by_view(view) + self.db.get_block_by_view(view) } pub fn get_canonical_block_by_number(&self, number: u64) -> Result> { - self.block_store.get_canonical_block_by_number(number) + self.db.get_canonical_block_by_number(number) } fn set_finalized_view(&mut self, view: u64) -> Result<()> { @@ -2617,7 +2544,7 @@ impl Consensus { pub fn state_at(&self, number: u64) -> Result> { Ok(self - .block_store + .db .get_canonical_block_by_number(number)? .map(|block| self.state.at_root(block.state_root_hash().into()))) } @@ -3215,71 +3142,33 @@ impl Consensus { } } + pub fn get_num_transactions(&self) -> Result { + let count = self.db.get_total_transaction_count()?; + Ok(count) + } + pub fn report_outgoing_message_failure( &mut self, - failure: OutgoingMessageFailure, + _failure: OutgoingMessageFailure, ) -> Result<()> { - self.block_store.report_outgoing_message_failure(failure) + Ok(()) // FIXME: Stub } pub fn tick(&mut self) -> Result<()> { trace!("consensus::tick()"); trace!("request_missing_blocks from timer"); - // Drives the block fetching state machine - see docs/fetching_blocks.md - if self.block_store.request_missing_blocks()? { - // We're syncing.. - // Is it likely that the next thing in the buffer could be the next block? - let likely_blocks = self.block_store.next_proposals_if_likely()?; - if likely_blocks.is_empty() { - trace!("no blocks buffered"); - // If there are no next blocks buffered, someone may well have lied to us about - // where the gaps in the view range are. This should be a rare occurrence, so in - // lieu of timing it out, just zap the view range gap and we'll take the hit on - // any rerequests. - self.block_store.delete_empty_view_range_cache(); - } else { - likely_blocks.into_iter().for_each(|(from, block)| { - trace!( - "buffer may contain the next block - {0:?} v={1} n={2}", - block.hash(), - block.view(), - block.number() - ); - // Ignore errors here - just carry on and wait for re-request to clean up. - let view = block.view(); - let _ = self.message_sender.send_external_message( - self.peer_id(), - ExternalMessage::ProcessProposal(ProcessProposal { - from: from.to_bytes(), - block, - }), - ); - self.block_store.expect_process_proposal(view); - }); - } + // Drives syncing from timeouts, not just new Proposals + if self.sync.am_syncing()? 
{ + // TODO: Sync from Timeouts } else { trace!("not syncing ..."); } Ok(()) } - pub fn buffer_proposal(&mut self, from: PeerId, proposal: Proposal) -> Result<()> { - self.block_store.buffer_proposal(from, proposal)?; - Ok(()) - } - - pub fn buffer_lack_of_proposals( - &mut self, - from_view: u64, - proposals: &Vec, - ) -> Result<()> { - self.block_store - .buffer_lack_of_proposals(from_view, proposals) - } - pub fn get_sync_data(&self) -> Result> { - self.block_store.get_sync_data() + self.sync.get_sync_data() } } diff --git a/zilliqa/src/db.rs b/zilliqa/src/db.rs index aba4e0968..31c104b90 100644 --- a/zilliqa/src/db.rs +++ b/zilliqa/src/db.rs @@ -1191,7 +1191,7 @@ fn decompress_file + Debug>(input_file_path: P, output_file_path: /// An implementor of [eth_trie::DB] which uses a [Connection] to persist data. #[derive(Debug, Clone)] pub struct TrieStorage { - db: Arc>, + pub db: Arc>, cache: Arc, Vec>>>, } diff --git a/zilliqa/src/exec.rs b/zilliqa/src/exec.rs index 49faece60..78eaafad5 100644 --- a/zilliqa/src/exec.rs +++ b/zilliqa/src/exec.rs @@ -415,7 +415,6 @@ impl DatabaseRef for &State { fn block_hash_ref(&self, number: u64) -> Result { Ok(self - .block_store .get_canonical_block_by_number(number)? .map(|block| B256::new(block.hash().0)) .unwrap_or_default()) @@ -1203,15 +1202,11 @@ impl PendingState { } pub fn get_canonical_block_by_number(&self, block_number: u64) -> Result> { - self.pre_state - .block_store - .get_canonical_block_by_number(block_number) + self.pre_state.get_canonical_block_by_number(block_number) } pub fn get_highest_canonical_block_number(&self) -> Result> { - self.pre_state - .block_store - .get_highest_canonical_block_number() + self.pre_state.get_highest_canonical_block_number() } pub fn load_account(&mut self, address: Address) -> Result<&mut PendingAccount> { diff --git a/zilliqa/src/lib.rs b/zilliqa/src/lib.rs index bbb360644..9b783a862 100644 --- a/zilliqa/src/lib.rs +++ b/zilliqa/src/lib.rs @@ -1,5 +1,4 @@ pub mod api; -pub mod block_store; mod blockhooks; pub mod cfg; pub mod consensus; diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 589d7065c..58744ce98 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -911,26 +911,6 @@ impl Node { Ok(()) } - fn _handle_block_response(&mut self, from: PeerId, response: BlockResponse) -> Result<()> { - trace!( - "block_store::handle_block_response - received blocks response of length {}", - response.proposals.len() - ); - self.consensus - .receive_block_availability(from, &response.availability)?; - - self.consensus - .buffer_lack_of_proposals(response.from_view, &response.proposals)?; - - for block in response.proposals { - // Buffer the block so that we know we have it - in fact, add it to the cache so - // that we can include it in the chain if necessary. 
- self.consensus.buffer_proposal(from, block)?; - } - trace!("block_store::handle_block_response: finished handling response"); - Ok(()) - } - fn handle_injected_proposal(&mut self, from: PeerId, req: InjectedProposal) -> Result<()> { if from != self.consensus.peer_id() { warn!("Someone ({from}) sent me a InjectedProposal; illegal- ignoring"); diff --git a/zilliqa/src/pool.rs b/zilliqa/src/pool.rs index 32b71d908..9239e4fc7 100644 --- a/zilliqa/src/pool.rs +++ b/zilliqa/src/pool.rs @@ -403,16 +403,13 @@ mod tests { primitives::{Address, Bytes, PrimitiveSignature, TxKind, U256}, }; use anyhow::Result; - use libp2p::PeerId; use rand::{seq::SliceRandom, thread_rng}; use super::TransactionPool; use crate::{ - block_store::BlockStore, cfg::NodeConfig, crypto::Hash, db::Db, - node::{MessageSender, RequestId}, state::State, transaction::{EvmGas, SignedTransaction, TxIntershard, VerifiedTransaction}, }; @@ -468,23 +465,10 @@ mod tests { fn get_in_memory_state() -> Result { let node_config = NodeConfig::default(); - let (s1, _) = tokio::sync::mpsc::unbounded_channel(); - let (s2, _) = tokio::sync::mpsc::unbounded_channel(); - - let message_sender = MessageSender { - our_shard: 0, - our_peer_id: PeerId::random(), - outbound_channel: s1, - local_channel: s2, - request_id: RequestId::default(), - }; - let db = Db::new::(None, 0, 0)?; let db = Arc::new(db); - let block_store = BlockStore::new(&node_config, db.clone(), message_sender.clone())?; - - State::new_with_genesis(db.state_trie()?, node_config, Arc::new(block_store)) + State::new_with_genesis(db.state_trie()?, node_config, db.clone()) } fn create_acc(state: &mut State, address: Address, balance: u128, nonce: u64) -> Result<()> { diff --git a/zilliqa/src/state.rs b/zilliqa/src/state.rs index 059efe9db..d31000cd9 100644 --- a/zilliqa/src/state.rs +++ b/zilliqa/src/state.rs @@ -17,13 +17,12 @@ use sha3::{Digest, Keccak256}; use tracing::debug; use crate::{ - block_store::BlockStore, cfg::{Amount, Forks, NodeConfig, ScillaExtLibsPath}, contracts::{self, Contract}, crypto::{self, Hash}, - db::TrieStorage, + db::{Db, TrieStorage}, error::ensure_success, - message::{BlockHeader, MAX_COMMITTEE_SIZE}, + message::{Block, BlockHeader, MAX_COMMITTEE_SIZE}, node::ChainId, scilla::{ParamValue, Scilla, Transition}, serde_util::vec_param_value, @@ -40,6 +39,7 @@ use crate::{ /// the storage root is used to index into the state /// all the keys are hashed and stored in the same sled tree pub struct State { + sql: Arc, db: Arc, accounts: PatriciaTrie, /// The Scilla interpreter interface. Note that it is lazily initialized - This is a bit of a hack to ensure that @@ -54,11 +54,10 @@ pub struct State { pub scilla_call_gas_exempt_addrs: Vec
, pub chain_id: ChainId, pub forks: Forks, - pub block_store: Arc, } impl State { - pub fn new(trie: TrieStorage, config: &NodeConfig, block_store: Arc) -> State { + pub fn new(trie: TrieStorage, config: &NodeConfig, sql: Arc) -> State { let db = Arc::new(trie); let consensus_config = &config.consensus; Self { @@ -74,7 +73,7 @@ impl State { scilla_call_gas_exempt_addrs: consensus_config.scilla_call_gas_exempt_addrs.clone(), chain_id: ChainId::new(config.eth_chain_id), forks: consensus_config.forks.clone(), - block_store, + sql, } } @@ -95,17 +94,13 @@ impl State { trie: TrieStorage, root_hash: B256, config: NodeConfig, - block_store: Arc, + sql: Arc, ) -> Self { - Self::new(trie, &config, block_store).at_root(root_hash) + Self::new(trie, &config, sql).at_root(root_hash) } - pub fn new_with_genesis( - trie: TrieStorage, - config: NodeConfig, - block_store: Arc, - ) -> Result { - let mut state = State::new(trie, &config, block_store); + pub fn new_with_genesis(trie: TrieStorage, config: NodeConfig, sql: Arc) -> Result { + let mut state = State::new(trie, &config, sql); if config.consensus.is_main { let shard_data = contracts::shard_registry::CONSTRUCTOR.encode_input( @@ -285,8 +280,8 @@ impl State { gas_price: self.gas_price, scilla_call_gas_exempt_addrs: self.scilla_call_gas_exempt_addrs.clone(), chain_id: self.chain_id, - block_store: self.block_store.clone(), forks: self.forks.clone(), + sql: self.sql.clone(), } } @@ -382,6 +377,14 @@ impl State { &bincode::serialize(&account)?, )?) } + + pub fn get_canonical_block_by_number(&self, number: u64) -> Result> { + self.sql.get_canonical_block_by_number(number) + } + + pub fn get_highest_canonical_block_number(&self) -> Result> { + self.sql.get_highest_canonical_block_number() + } } pub mod contract_addr { @@ -579,37 +582,23 @@ mod tests { use std::{path::PathBuf, sync::Arc}; use crypto::Hash; - use libp2p::PeerId; use revm::primitives::FixedBytes; use super::*; use crate::{ api::to_hex::ToHex, - block_store::BlockStore, cfg::NodeConfig, db::Db, message::BlockHeader, - node::{MessageSender, RequestId}, }; #[test] fn deposit_contract_updateability() { - let (s1, _) = tokio::sync::mpsc::unbounded_channel(); - let (s2, _) = tokio::sync::mpsc::unbounded_channel(); - let message_sender = MessageSender { - our_shard: 0, - our_peer_id: PeerId::random(), - outbound_channel: s1, - local_channel: s2, - request_id: RequestId::default(), - }; let db = Db::new::(None, 0, 0).unwrap(); let db = Arc::new(db); let config = NodeConfig::default(); - let block_store = - Arc::new(BlockStore::new(&config, db.clone(), message_sender.clone()).unwrap()); - let mut state = State::new(db.state_trie().unwrap(), &config, block_store); + let mut state = State::new(db.state_trie().unwrap(), &config, db); let deposit_init_addr = state.deploy_initial_deposit_contract(&config).unwrap(); diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 6b0348dfa..44629c744 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -19,7 +19,10 @@ use crate::{ cfg::NodeConfig, crypto::Hash, db::Db, - message::{Block, BlockRequest, BlockRequestV2, BlockResponse, ChainMetaData, ExternalMessage, InjectedProposal, Proposal}, + message::{ + Block, BlockRequest, BlockRequestV2, BlockResponse, ChainMetaData, ExternalMessage, + InjectedProposal, Proposal, + }, node::MessageSender, time::SystemTime, }; @@ -1067,6 +1070,13 @@ impl Sync { } } + /// Add bulk peers + pub fn add_peers(&mut self, peers: Vec) { + for peer in peers { + self.add_peer(peer); + } + } + /// Add a peer to the list of 
peers. pub fn add_peer(&mut self, peer: PeerId) { // if the new peer is not synced, it will get downgraded to the back of heap. @@ -1101,30 +1111,28 @@ impl Sync { } /// Returns (am_syncing, current_highest_block) - pub fn am_syncing(&self) -> Result<(bool, Block)> { - let highest_block = self - .db - .get_canonical_block_by_number( - self.db - .get_highest_canonical_block_number()? - .expect("no highest block"), - )? - .expect("missing highest block"); - Ok(( - self.in_pipeline != 0 - || !self.recent_proposals.is_empty() - || self.count_segments()? != 0, - highest_block, - )) + pub fn am_syncing(&self) -> Result { + Ok(self.in_pipeline != 0 + || self.count_segments()? != 0 + || !self.recent_proposals.is_empty()) } // Returns (starting_block, current_block, highest_block) if we're syncing, // None if we're not. pub fn get_sync_data(&self) -> Result> { - let (flag, highest_block) = self.am_syncing()?; + let flag = self.am_syncing()?; if !flag { Ok(None) } else { + let highest_block = self + .db + .get_canonical_block_by_number( + self.db + .get_highest_canonical_block_number()? + .expect("no highest block"), + )? + .expect("missing highest block"); + let highest_saved_block_number = highest_block.number(); let highest_block_number_seen = self.recent_proposals.back().unwrap().number(); Ok(Some(( From 0af7bf11880fa579cb6870b9e06ef14f9480381a Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 16 Jan 2025 11:10:42 +0800 Subject: [PATCH 068/119] feat: added sync from timeout, not just proposals. --- zilliqa/src/consensus.rs | 1 + zilliqa/src/sync.rs | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 674b20d29..f079b538d 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -3161,6 +3161,7 @@ impl Consensus { // Drives syncing from timeouts, not just new Proposals if self.sync.am_syncing()? { // TODO: Sync from Timeouts + self.sync.sync_internal()?; } else { trace!("not syncing ..."); } diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 44629c744..4fcf76534 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -347,6 +347,16 @@ impl Sync { } self.recent_proposals.push_back(proposal); + self.sync_internal() + } + + pub fn sync_internal(&mut self) -> Result<()> { + if self.recent_proposals.is_empty() { + // Do nothing if there's no recent proposals. + tracing::debug!("sync::Internal : missing recent proposals"); + return Ok(()); + } + match self.state { // Check if we are out of sync SyncState::Phase0 if self.in_pipeline == 0 => { From 0795f0e9b79e9cc0d6d852fac85bf4e80a4539fa Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 16 Jan 2025 17:40:58 +0800 Subject: [PATCH 069/119] feat: made the batch_size dynamic, so that it can get past a larger range. 
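In rough terms: when a peer answers a V1 block request with an empty BlockResponse that still carries availability info (a hint that the real reply would have been too large to serve), the request batch is shrunk a little; once a non-empty response arrives, it is restored to the configured size. The fixed VIEW_DRIFT constant is also replaced by a drift proportional to the current batch size (batch / 10). A minimal sketch of the adjustment, with illustrative names (`BATCH` stands in for the configured block_request_batch_size; the real code keeps the value in max_batch_size / max_batch_size_const):

    const BATCH: usize = 100; // stands in for config.block_request_batch_size

    // Empty response with availability attached: back off by ~5% of the
    // configured size, but never below 5 blocks.
    fn on_empty_response(batch_size: usize) -> usize {
        batch_size.saturating_sub(BATCH / 20).max(5)
    }

    // A non-empty response means the peer kept up: restore the full size.
    fn on_good_response(_batch_size: usize) -> usize {
        BATCH
    }

    fn main() {
        let mut size = BATCH;
        size = on_empty_response(size);
        assert_eq!(size, 95);
        size = on_good_response(size);
        assert_eq!(size, 100);
    }

Shrinking slowly and restoring in one step keeps sync throughput high in the common case while still letting a node work its way past a range of large blocks.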
--- zilliqa/src/state.rs | 7 +------ zilliqa/src/sync.rs | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/zilliqa/src/state.rs b/zilliqa/src/state.rs index d31000cd9..a77a6e265 100644 --- a/zilliqa/src/state.rs +++ b/zilliqa/src/state.rs @@ -585,12 +585,7 @@ mod tests { use revm::primitives::FixedBytes; use super::*; - use crate::{ - api::to_hex::ToHex, - cfg::NodeConfig, - db::Db, - message::BlockHeader, - }; + use crate::{api::to_hex::ToHex, cfg::NodeConfig, db::Db, message::BlockHeader}; #[test] fn deposit_contract_updateability() { diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 4fcf76534..142bc686c 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -72,6 +72,7 @@ pub struct Sync { request_timeout: Duration, // how many blocks to request at once max_batch_size: usize, + max_batch_size_const: usize, // how many blocks to inject into the queue max_blocks_in_flight: usize, // count of proposals pending in the pipeline @@ -95,14 +96,6 @@ impl Sync { #[cfg(debug_assertions)] const DO_SPECULATIVE: bool = false; - // For V1 BlockRequest, we request a little more than we need, due to drift - // Since the view number is an 'internal' clock, it is possible for the same block number - // to have different view numbers. - // 10 ~ 1-min - // 20 ~ 1-hr - // 30 ~ 2-days - const VIEW_DRIFT: u64 = 10; - // Minimum of 2 peers to avoid single source of truth. const MIN_PEERS: usize = 2; @@ -122,9 +115,7 @@ impl Sync { }) .collect(); let peer_id = message_sender.our_peer_id; - let max_batch_size = config - .block_request_batch_size - .clamp(Self::VIEW_DRIFT as usize * 2, 180); // up to 180 sec of blocks at a time. + let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. // This DB could be left in-here as it is only used in this module @@ -167,6 +158,7 @@ impl Sync { peer_id, request_timeout: config.consensus.consensus_timeout, max_batch_size, + max_batch_size_const: max_batch_size, max_blocks_in_flight, in_flight: None, in_pipeline: usize::MIN, @@ -707,8 +699,20 @@ impl Sync { // Downgrade empty responses if response.proposals.is_empty() { tracing::info!("sync::HandleBlockResponse : empty response {from}"); + + if let Some(availability) = response.availability { + tracing::info!("sync::Availability {}", availability.len()); + // response may be too large, so reduce request range + // this has the impact of slowing sync progress to a crawl. + self.max_batch_size = self + .max_batch_size + .saturating_sub(self.max_batch_size_const / 20) + .max(5); // 5% reduce, down to 5 - empirical value + } self.done_with_peer(DownGrade::Empty); return Ok(()); + } else { + self.max_batch_size = self.max_batch_size_const; } tracing::trace!( @@ -962,8 +966,12 @@ impl Sync { SyncState::Phase1(ChainMetaData { view_number, .. }) if matches!(peer.version, PeerVer::V1) => { + // For V1 BlockRequest, we request a little more than we need, due to drift + // Since the view number is an 'internal' clock, it is possible for the same block number + // to have different view numbers. 
+ let drift = self.max_batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { - to_view: view_number.saturating_add(Self::VIEW_DRIFT), + to_view: view_number.saturating_add(drift), from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } @@ -981,8 +989,9 @@ impl Sync { let meta = meta.unwrap(); let view_number = meta.view_number; self.state = SyncState::Phase1(meta); + let drift = self.max_batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { - to_view: view_number.saturating_add(Self::VIEW_DRIFT), + to_view: view_number.saturating_add(drift), from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } From 3b7ffcf1c24d1962bd9e49434d2232f3bbbdba79 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 17 Jan 2025 10:46:08 +0800 Subject: [PATCH 070/119] feat: added dynamic_batch_sizing() which is reactive, not pro-active. --- zilliqa/src/sync.rs | 62 +++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 142bc686c..c9b63b8d7 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -609,6 +609,7 @@ impl Sync { "sync::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", peer.peer_id ); + self.dynamic_batch_sizing(peer.peer_id, DownGrade::Timeout)?; self.done_with_peer(DownGrade::Timeout); } else { return Ok(()); @@ -679,6 +680,45 @@ impl Sync { Ok(()) } + /// Phase 1: Dynamic Batch Sizing + /// + /// Due to a hard-coded 10MB response limit in libp2p, we may be limited in how many blocks we can request + /// for in a single request, between 1-100 blocks. + /// TODO: Make this a pro-active setting instead. + fn dynamic_batch_sizing(&mut self, from: PeerId, reason: DownGrade) -> Result<()> { + let Some(peer) = self.in_flight.as_ref() else { + todo!("invalid peer"); + }; + + match (&self.state, &peer.version, reason) { + // V1 response may be too large. Reduce request range. + (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => { + self.max_batch_size = self + .max_batch_size + .saturating_sub(self.max_batch_size / 2) + .max(1); + } + (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => { + self.max_batch_size = self + .max_batch_size + .saturating_sub(self.max_batch_size / 3) + .max(1); + } + // V1 responses are going well, increase the request range linearly + (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) if from == peer.peer_id => { + self.max_batch_size = self + .max_batch_size + .saturating_add(self.max_batch_size_const / 10) + // For V1, ~100 empty blocks saturates the response payload + .min(100); + } + // V2 response may be too large, which can induce a timeout. Split into 10 block segments + _ => {} + } + + Ok(()) + } + /// Phase 1 / 2: Handle a V1 block response /// /// If the response if from a V2 peer, it will upgrade that peer to V2. @@ -696,25 +736,6 @@ impl Sync { return Ok(()); } - // Downgrade empty responses - if response.proposals.is_empty() { - tracing::info!("sync::HandleBlockResponse : empty response {from}"); - - if let Some(availability) = response.availability { - tracing::info!("sync::Availability {}", availability.len()); - // response may be too large, so reduce request range - // this has the impact of slowing sync progress to a crawl. 
- self.max_batch_size = self - .max_batch_size - .saturating_sub(self.max_batch_size_const / 20) - .max(5); // 5% reduce, down to 5 - empirical value - } - self.done_with_peer(DownGrade::Empty); - return Ok(()); - } else { - self.max_batch_size = self.max_batch_size_const; - } - tracing::trace!( "sync::HandleBlockResponse : received {} blocks from {from}", response.proposals.len() @@ -807,9 +828,11 @@ impl Sync { if response.is_empty() { // Empty response, downgrade peer and retry with a new peer. tracing::warn!("sync::MetadataResponse : empty blocks {from}",); + self.dynamic_batch_sizing(from, DownGrade::Empty)?; self.done_with_peer(DownGrade::Empty); return Ok(()); } else { + self.dynamic_batch_sizing(from, DownGrade::None)?; self.done_with_peer(DownGrade::None); } @@ -933,6 +956,7 @@ impl Sync { "sync::RequestMissingMetadata : in-flight request {} timed out, requesting from new peer", peer.peer_id ); + self.dynamic_batch_sizing(peer.peer_id, DownGrade::Timeout)?; self.done_with_peer(DownGrade::Timeout); } else { return Ok(()); From 49ad23c9f2d859e70365d0e6a62ab5c097cef954 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 17 Jan 2025 11:29:59 +0800 Subject: [PATCH 071/119] feat: make dynamic_batch_sizing() work per-peer, not per sync. --- zilliqa/src/sync.rs | 125 ++++++++++++++++++++++++-------------------- 1 file changed, 68 insertions(+), 57 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index c9b63b8d7..9c588dff9 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -72,7 +72,6 @@ pub struct Sync { request_timeout: Duration, // how many blocks to request at once max_batch_size: usize, - max_batch_size_const: usize, // how many blocks to inject into the queue max_blocks_in_flight: usize, // count of proposals pending in the pipeline @@ -105,6 +104,9 @@ impl Sync { message_sender: MessageSender, peers: Vec, ) -> Result { + let peer_id = message_sender.our_peer_id; + let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. + let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. let peers = peers .into_iter() .map(|peer_id| PeerInfo { @@ -112,11 +114,10 @@ impl Sync { score: 0, peer_id, last_used: Instant::now(), + reason: DownGrade::None, + batch_size: max_batch_size, }) .collect(); - let peer_id = message_sender.our_peer_id; - let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. - let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. // This DB could be left in-here as it is only used in this module // TODO: Make this in-memory by exploiting SQLite TEMP tables i.e. 
CREATE TEMP TABLE @@ -158,7 +159,6 @@ impl Sync { peer_id, request_timeout: config.consensus.consensus_timeout, max_batch_size, - max_batch_size_const: max_batch_size, max_blocks_in_flight, in_flight: None, in_pipeline: usize::MIN, @@ -232,7 +232,9 @@ impl Sync { last_used: Instant::now(), score:u32::MAX, version: row.get(5)?, - peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), + reason: DownGrade::None, + batch_size: self.max_batch_size, + peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), }, ))) .optional()?; @@ -609,7 +611,6 @@ impl Sync { "sync::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", peer.peer_id ); - self.dynamic_batch_sizing(peer.peer_id, DownGrade::Timeout)?; self.done_with_peer(DownGrade::Timeout); } else { return Ok(()); @@ -653,6 +654,8 @@ impl Sync { version: PeerVer::V2, peer_id: peer_info.peer_id, last_used: std::time::Instant::now(), + batch_size: self.max_batch_size, // unused in Phase 2 + reason: DownGrade::None, score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers }); ExternalMessage::MultiBlockRequest(request_hashes) @@ -662,6 +665,8 @@ impl Sync { version: PeerVer::V1, peer_id: peer_info.peer_id, last_used: std::time::Instant::now(), + batch_size: self.max_batch_size, // unused in Phase 2 + reason: DownGrade::None, score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers }); // do not add VIEW_DRIFT - the stored marker is accurate! @@ -680,45 +685,6 @@ impl Sync { Ok(()) } - /// Phase 1: Dynamic Batch Sizing - /// - /// Due to a hard-coded 10MB response limit in libp2p, we may be limited in how many blocks we can request - /// for in a single request, between 1-100 blocks. - /// TODO: Make this a pro-active setting instead. - fn dynamic_batch_sizing(&mut self, from: PeerId, reason: DownGrade) -> Result<()> { - let Some(peer) = self.in_flight.as_ref() else { - todo!("invalid peer"); - }; - - match (&self.state, &peer.version, reason) { - // V1 response may be too large. Reduce request range. - (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => { - self.max_batch_size = self - .max_batch_size - .saturating_sub(self.max_batch_size / 2) - .max(1); - } - (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => { - self.max_batch_size = self - .max_batch_size - .saturating_sub(self.max_batch_size / 3) - .max(1); - } - // V1 responses are going well, increase the request range linearly - (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) if from == peer.peer_id => { - self.max_batch_size = self - .max_batch_size - .saturating_add(self.max_batch_size_const / 10) - // For V1, ~100 empty blocks saturates the response payload - .min(100); - } - // V2 response may be too large, which can induce a timeout. Split into 10 block segments - _ => {} - } - - Ok(()) - } - /// Phase 1 / 2: Handle a V1 block response /// /// If the response if from a V2 peer, it will upgrade that peer to V2. @@ -828,11 +794,9 @@ impl Sync { if response.is_empty() { // Empty response, downgrade peer and retry with a new peer. 
tracing::warn!("sync::MetadataResponse : empty blocks {from}",); - self.dynamic_batch_sizing(from, DownGrade::Empty)?; self.done_with_peer(DownGrade::Empty); return Ok(()); } else { - self.dynamic_batch_sizing(from, DownGrade::None)?; self.done_with_peer(DownGrade::None); } @@ -956,7 +920,6 @@ impl Sync { "sync::RequestMissingMetadata : in-flight request {} timed out, requesting from new peer", peer.peer_id ); - self.dynamic_batch_sizing(peer.peer_id, DownGrade::Timeout)?; self.done_with_peer(DownGrade::Timeout); } else { return Ok(()); @@ -973,7 +936,7 @@ impl Sync { if let Some(peer) = self.get_next_peer() { tracing::info!( "sync::RequestMissingMetadata : requesting {} metadata of segment #{} from {}", - self.max_batch_size, + peer.batch_size, self.count_segments()? + 1, peer.peer_id ); @@ -984,7 +947,7 @@ impl Sync { ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, - batch_size: self.max_batch_size, + batch_size: peer.batch_size, }) } SyncState::Phase1(ChainMetaData { view_number, .. }) @@ -993,10 +956,10 @@ impl Sync { // For V1 BlockRequest, we request a little more than we need, due to drift // Since the view number is an 'internal' clock, it is possible for the same block number // to have different view numbers. - let drift = self.max_batch_size as u64 / 10; + let drift = peer.batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { to_view: view_number.saturating_add(drift), - from_view: view_number.saturating_sub(self.max_batch_size as u64), + from_view: view_number.saturating_sub(peer.batch_size as u64), }) } SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V2) => { @@ -1006,17 +969,17 @@ impl Sync { ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, - batch_size: self.max_batch_size, + batch_size: peer.batch_size, }) } SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V1) => { let meta = meta.unwrap(); let view_number = meta.view_number; self.state = SyncState::Phase1(meta); - let drift = self.max_batch_size as u64 / 10; + let drift = peer.batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { to_view: view_number.saturating_add(drift), - from_view: view_number.saturating_sub(self.max_batch_size as u64), + from_view: view_number.saturating_sub(peer.batch_size as u64), }) } _ => anyhow::bail!("sync::MissingMetadata : invalid state"), @@ -1103,6 +1066,7 @@ impl Sync { fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { tracing::trace!("sync::DoneWithPeer {} {:?}", peer.peer_id, downgrade); + peer.reason = downgrade.clone(); peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better peer.score = peer.score.max(self.peers.peek().unwrap().score); @@ -1129,6 +1093,8 @@ impl Sync { score: self.peers.iter().map(|p| p.score).min().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), + reason: DownGrade::None, + batch_size: self.max_batch_size, }; tracing::trace!("sync::AddPeer {peer}"); // ensure that it is unique - avoids single source of truth @@ -1147,12 +1113,43 @@ impl Sync { if self.peers.len() >= Self::MIN_PEERS { let mut peer = self.peers.pop()?; peer.last_used = std::time::Instant::now(); // used to determine stale requests. 
+ peer.batch_size = self.dynamic_batch_sizing(& peer); tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); return Some(peer); } None } + /// Phase 1: Dynamic Batch Sizing + /// + /// Due to a hard-coded 10MB response limit in libp2p, we may be limited in how many blocks we can request + /// for in a single request, between 1-100 blocks. + fn dynamic_batch_sizing(&self, peer: &PeerInfo) -> usize { + match (&self.state, &peer.version, &peer.reason) { + // V1 response may be too large. Reduce request range. + (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => { + peer.batch_size + .saturating_sub(peer.batch_size / 2) + .max(1) + } + // V1 response may be too large. Reduce request range. + (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => { + peer.batch_size + .saturating_sub(peer.batch_size / 3) + .max(1) + } + // V1 responses are going well, increase the request range linearly + (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) => { + peer.batch_size + .saturating_add(self.max_batch_size / 10) + // For V1, ~100 empty blocks saturates the response payload + .min(100) + } + // V2 response may be too large, which can induce a timeout. Split into 10 block segments + _ => { self.max_batch_size} + } + } + /// Returns (am_syncing, current_highest_block) pub fn am_syncing(&self) -> Result { Ok(self.in_pipeline != 0 @@ -1193,6 +1190,8 @@ struct PeerInfo { peer_id: PeerId, last_used: Instant, version: PeerVer, + batch_size: usize, + reason: DownGrade, } impl Ord for PeerInfo { @@ -1212,13 +1211,25 @@ impl PartialOrd for PeerInfo { /// For downgrading a peer from being selected in get_next_peer(). /// Ordered by degree of offence i.e. None is good, Timeout is worst -#[derive(Debug)] +#[derive(Debug, Clone, Eq, PartialEq)] enum DownGrade { None, Empty, Timeout, } +impl Ord for DownGrade { + fn cmp(&self, other: &Self) -> Ordering { + (self.clone() as u32).cmp(&(other.clone() as u32)) + } +} + +impl PartialOrd for DownGrade { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + /// Sync state #[derive(Debug)] enum SyncState { From f12793e89ac077f0131b9a426d3a0c135cea983d Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 17 Jan 2025 14:36:26 +0800 Subject: [PATCH 072/119] fix: wire up peers in test Network. 
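With BlockStore gone, the test harness has to hand each node's Sync its peer list explicitly when the network is built, when a node is added, and when nodes are restarted; the block_production test's wait limit is also raised from 100 to 1000. A rough sketch of the wiring this patch adds, using stand-in types (the real harness uses libp2p::PeerId and the Network/Node structs in tests/it/main.rs):

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct PeerId(u64);

    struct Sync {
        our_id: PeerId,
        peers: Vec<PeerId>,
    }

    impl Sync {
        // Mirrors Sync::add_peers: seed every known peer, skipping ourselves
        // and duplicates, so no single peer becomes the only source of truth.
        fn add_peers(&mut self, peers: Vec<PeerId>) {
            for peer in peers {
                if peer != self.our_id && !self.peers.contains(&peer) {
                    self.peers.push(peer);
                }
            }
        }
    }

    fn main() {
        let all: Vec<PeerId> = (0..4).map(PeerId).collect();
        let mut syncers: Vec<Sync> = all
            .iter()
            .map(|&id| Sync { our_id: id, peers: Vec::new() })
            .collect();
        // After constructing the network, every node learns about the others.
        for sync in &mut syncers {
            sync.add_peers(all.clone());
        }
        assert!(syncers.iter().all(|s| s.peers.len() == 3));
    }

In the real tests the same peer-ID collection is applied via node.inner.lock().unwrap().consensus.sync.add_peers(peers.clone()) for every node in the network.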
--- zilliqa/src/sync.rs | 35 +++++++++++++++++++++-------------- zilliqa/tests/it/consensus.rs | 2 +- zilliqa/tests/it/main.rs | 19 +++++++++++++++++++ 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 9c588dff9..6ebd89114 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -234,7 +234,7 @@ impl Sync { version: row.get(5)?, reason: DownGrade::None, batch_size: self.max_batch_size, - peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), + peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), }, ))) .optional()?; @@ -680,7 +680,10 @@ impl Sync { .send_external_message(peer_info.peer_id, message)?; } } else { - tracing::warn!("sync::RequestMissingBlocks : insufficient peers to handle request"); + tracing::warn!( + "sync::RequestMissingBlocks : {} insufficient peers to handle request", + self.peers.len() + ); } Ok(()) } @@ -988,7 +991,10 @@ impl Sync { .send_external_message(peer.peer_id, message)?; self.in_flight = Some(peer); } else { - tracing::warn!("sync::RequestMissingMetadata : insufficient peers to handle request"); + tracing::warn!( + "sync::RequestMissingBlocks : {} insufficient peers to handle request", + self.peers.len() + ); } Ok(()) } @@ -1079,8 +1085,11 @@ impl Sync { /// Add bulk peers pub fn add_peers(&mut self, peers: Vec) { + tracing::debug!("sync::AddPeers {:?}", peers); for peer in peers { - self.add_peer(peer); + if peer != self.peer_id { + self.add_peer(peer); + } } } @@ -1096,10 +1105,11 @@ impl Sync { reason: DownGrade::None, batch_size: self.max_batch_size, }; - tracing::trace!("sync::AddPeer {peer}"); // ensure that it is unique - avoids single source of truth self.peers.retain(|p: &PeerInfo| p.peer_id != peer); self.peers.push(new_peer); + + tracing::trace!("sync::AddPeer {peer}/{}", self.peers.len()); } /// Remove a peer from the list of peers. @@ -1113,10 +1123,11 @@ impl Sync { if self.peers.len() >= Self::MIN_PEERS { let mut peer = self.peers.pop()?; peer.last_used = std::time::Instant::now(); // used to determine stale requests. - peer.batch_size = self.dynamic_batch_sizing(& peer); + peer.batch_size = self.dynamic_batch_sizing(&peer); tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); return Some(peer); } + tracing::warn!("sync::NextPeer : {} insufficient peers", self.peers.len()); None } @@ -1128,25 +1139,21 @@ impl Sync { match (&self.state, &peer.version, &peer.reason) { // V1 response may be too large. Reduce request range. (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => { - peer.batch_size - .saturating_sub(peer.batch_size / 2) - .max(1) + peer.batch_size.saturating_sub(peer.batch_size / 2).max(1) } // V1 response may be too large. Reduce request range. (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => { - peer.batch_size - .saturating_sub(peer.batch_size / 3) - .max(1) + peer.batch_size.saturating_sub(peer.batch_size / 3).max(1) } // V1 responses are going well, increase the request range linearly (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) => { - peer.batch_size + peer.batch_size .saturating_add(self.max_batch_size / 10) // For V1, ~100 empty blocks saturates the response payload .min(100) } // V2 response may be too large, which can induce a timeout. 
Split into 10 block segments - _ => { self.max_batch_size} + _ => self.max_batch_size, } } diff --git a/zilliqa/tests/it/consensus.rs b/zilliqa/tests/it/consensus.rs index 670ee3613..607c00fee 100644 --- a/zilliqa/tests/it/consensus.rs +++ b/zilliqa/tests/it/consensus.rs @@ -116,7 +116,7 @@ async fn block_production(mut network: Network) { .map_or(0, |b| b.number()) >= 10 }, - 100, + 1000, ) .await .unwrap(); diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 9be342e59..481b73c8b 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -401,6 +401,8 @@ impl Network { let receive_resend_message = UnboundedReceiverStream::new(receive_resend_message).boxed(); receivers.push(receive_resend_message); + let peers = nodes.iter().map(|n| n.peer_id).collect_vec(); + for node in &nodes { trace!( "Node {}: {} (dir: {})", @@ -408,6 +410,12 @@ impl Network { node.peer_id, node.dir.as_ref().unwrap().path().to_string_lossy(), ); + node.inner + .lock() + .unwrap() + .consensus + .sync + .add_peers(peers.clone()); } Network { @@ -505,6 +513,9 @@ impl Network { let (node, receiver, local_receiver, request_responses) = node(config, secret_key, onchain_key, self.nodes.len(), None).unwrap(); + let peers = self.nodes.iter().map(|n| n.peer_id).collect_vec(); + node.inner.lock().unwrap().consensus.sync.add_peers(peers); + trace!("Node {}: {}", node.index, node.peer_id); let index = node.index; @@ -567,6 +578,8 @@ impl Network { .chain(request_response_receivers) .collect(); + let peers = nodes.iter().map(|n| n.peer_id).collect_vec(); + for node in &nodes { trace!( "Node {}: {} (dir: {})", @@ -574,6 +587,12 @@ impl Network { node.peer_id, node.dir.as_ref().unwrap().path().to_string_lossy(), ); + node.inner + .lock() + .unwrap() + .consensus + .sync + .add_peers(peers.clone()); } let (resend_message, receive_resend_message) = mpsc::unbounded_channel::(); From b58d7f76bc78a52167887e3e517f85579bdd57c3 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 17 Jan 2025 15:57:17 +0800 Subject: [PATCH 073/119] fix: handle when V2 BlockResponse is late. --- zilliqa/src/sync.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 6ebd89114..9caea8cdf 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -700,8 +700,12 @@ impl Sync { && response.from_view == u64::MAX { tracing::info!("sync::HandleBlockResponse : upgrading {from}",); - self.in_flight.as_mut().unwrap().version = PeerVer::V2; - self.done_with_peer(DownGrade::None); + if let Some(peer) = self.in_flight.as_mut() { + if peer.peer_id == from { + peer.version = PeerVer::V2; + self.done_with_peer(DownGrade::None); + } + } return Ok(()); } From cf2b9c7cf2587308dc34ea3f89710acd22ca17e9 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 17 Jan 2025 17:54:35 +0800 Subject: [PATCH 074/119] feat: sync batch_size should depend on the current request_range, not peer specific. 
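This walks back part of the previous per-peer change: the batch size describes the request range the syncer is currently working through, not any particular peer, so it is kept as one sync-wide value and nudged by the outcome of the previous in-flight request (now recorded in in_flight_reason). A simplified sketch of the adjustment, with illustrative names; the real dynamic_batch_sizing() also matches on the sync phase and the peer's protocol version, and the Empty/None ratios shown here are assumed to carry over unchanged from the earlier per-peer version:

    #[derive(Clone, Copy)]
    enum DownGrade {
        None,    // previous request succeeded
        Empty,   // previous request came back empty
        Timeout, // previous request timed out
    }

    fn next_batch_size(current: usize, last: DownGrade, configured: usize) -> usize {
        match last {
            // The response was probably too large to serve: halve the range.
            DownGrade::Timeout => current.saturating_sub(current / 2).max(1),
            // An empty reply is a softer signal: shrink by a third.
            DownGrade::Empty => current.saturating_sub(current / 3).max(1),
            // Going well: grow linearly, capped where ~100 blocks would
            // saturate the 10MB libp2p response payload.
            DownGrade::None => current.saturating_add(configured / 10).min(100),
        }
    }

    fn main() {
        let mut size = 100;
        size = next_batch_size(size, DownGrade::Timeout, 100);
        assert_eq!(size, 50);
        size = next_batch_size(size, DownGrade::None, 100);
        assert_eq!(size, 60);
    }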
--- zilliqa/src/sync.rs | 59 +++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 9caea8cdf..7cb9e8df0 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -68,6 +68,7 @@ pub struct Sync { peers: BinaryHeap, // peer handling an in-flight request in_flight: Option, + in_flight_reason: DownGrade, // in-flight request timeout, before retry request_timeout: Duration, // how many blocks to request at once @@ -114,8 +115,6 @@ impl Sync { score: 0, peer_id, last_used: Instant::now(), - reason: DownGrade::None, - batch_size: max_batch_size, }) .collect(); @@ -161,6 +160,7 @@ impl Sync { max_batch_size, max_blocks_in_flight, in_flight: None, + in_flight_reason: DownGrade::None, in_pipeline: usize::MIN, state, recent_proposals: VecDeque::with_capacity(max_batch_size), @@ -232,8 +232,6 @@ impl Sync { last_used: Instant::now(), score:u32::MAX, version: row.get(5)?, - reason: DownGrade::None, - batch_size: self.max_batch_size, peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), }, ))) @@ -654,8 +652,6 @@ impl Sync { version: PeerVer::V2, peer_id: peer_info.peer_id, last_used: std::time::Instant::now(), - batch_size: self.max_batch_size, // unused in Phase 2 - reason: DownGrade::None, score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers }); ExternalMessage::MultiBlockRequest(request_hashes) @@ -665,8 +661,6 @@ impl Sync { version: PeerVer::V1, peer_id: peer_info.peer_id, last_used: std::time::Instant::now(), - batch_size: self.max_batch_size, // unused in Phase 2 - reason: DownGrade::None, score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers }); // do not add VIEW_DRIFT - the stored marker is accurate! @@ -943,7 +937,7 @@ impl Sync { if let Some(peer) = self.get_next_peer() { tracing::info!( "sync::RequestMissingMetadata : requesting {} metadata of segment #{} from {}", - peer.batch_size, + self.max_batch_size, self.count_segments()? + 1, peer.peer_id ); @@ -954,7 +948,7 @@ impl Sync { ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, - batch_size: peer.batch_size, + batch_size: self.max_batch_size, }) } SyncState::Phase1(ChainMetaData { view_number, .. }) @@ -963,10 +957,10 @@ impl Sync { // For V1 BlockRequest, we request a little more than we need, due to drift // Since the view number is an 'internal' clock, it is possible for the same block number // to have different view numbers. 
- let drift = peer.batch_size as u64 / 10; + let drift = self.max_batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { to_view: view_number.saturating_add(drift), - from_view: view_number.saturating_sub(peer.batch_size as u64), + from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V2) => { @@ -976,17 +970,17 @@ impl Sync { ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, - batch_size: peer.batch_size, + batch_size: self.max_batch_size, }) } SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V1) => { let meta = meta.unwrap(); let view_number = meta.view_number; self.state = SyncState::Phase1(meta); - let drift = peer.batch_size as u64 / 10; + let drift = self.max_batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { to_view: view_number.saturating_add(drift), - from_view: view_number.saturating_sub(peer.batch_size as u64), + from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } _ => anyhow::bail!("sync::MissingMetadata : invalid state"), @@ -1076,7 +1070,7 @@ impl Sync { fn done_with_peer(&mut self, downgrade: DownGrade) { if let Some(mut peer) = self.in_flight.take() { tracing::trace!("sync::DoneWithPeer {} {:?}", peer.peer_id, downgrade); - peer.reason = downgrade.clone(); + self.in_flight_reason = downgrade.clone(); peer.score = peer.score.saturating_add(downgrade as u32); // Ensure that the next peer is equal or better peer.score = peer.score.max(self.peers.peek().unwrap().score); @@ -1106,8 +1100,6 @@ impl Sync { score: self.peers.iter().map(|p| p.score).min().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), - reason: DownGrade::None, - batch_size: self.max_batch_size, }; // ensure that it is unique - avoids single source of truth self.peers.retain(|p: &PeerInfo| p.peer_id != peer); @@ -1127,7 +1119,7 @@ impl Sync { if self.peers.len() >= Self::MIN_PEERS { let mut peer = self.peers.pop()?; peer.last_used = std::time::Instant::now(); // used to determine stale requests. - peer.batch_size = self.dynamic_batch_sizing(&peer); + self.max_batch_size = self.dynamic_batch_sizing(&peer); tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); return Some(peer); } @@ -1140,22 +1132,23 @@ impl Sync { /// Due to a hard-coded 10MB response limit in libp2p, we may be limited in how many blocks we can request /// for in a single request, between 1-100 blocks. fn dynamic_batch_sizing(&self, peer: &PeerInfo) -> usize { - match (&self.state, &peer.version, &peer.reason) { + match (&self.state, &peer.version, &self.in_flight_reason) { // V1 response may be too large. Reduce request range. - (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => { - peer.batch_size.saturating_sub(peer.batch_size / 2).max(1) - } + (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => self + .max_batch_size + .saturating_sub(self.max_batch_size / 2) + .max(1), // V1 response may be too large. Reduce request range. 
- (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => { - peer.batch_size.saturating_sub(peer.batch_size / 3).max(1) - } + (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => self + .max_batch_size + .saturating_sub(self.max_batch_size / 3) + .max(1), // V1 responses are going well, increase the request range linearly - (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) => { - peer.batch_size - .saturating_add(self.max_batch_size / 10) - // For V1, ~100 empty blocks saturates the response payload - .min(100) - } + (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) => self + .max_batch_size + .saturating_add(self.max_batch_size) + // For V1, ~100 empty blocks saturates the response payload + .min(100), // V2 response may be too large, which can induce a timeout. Split into 10 block segments _ => self.max_batch_size, } @@ -1201,8 +1194,6 @@ struct PeerInfo { peer_id: PeerId, last_used: Instant, version: PeerVer, - batch_size: usize, - reason: DownGrade, } impl Ord for PeerInfo { From 8b40988cb5c35a0dac4720b3f4ba0a1c5d533fce Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 17 Jan 2025 22:42:37 +0800 Subject: [PATCH 075/119] feat: simplified the request_missing_metadata() match selector. --- zilliqa/src/sync.rs | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 7cb9e8df0..907049429 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -941,19 +941,15 @@ impl Sync { self.count_segments()? + 1, peer.peer_id ); - let message = match self.state { - SyncState::Phase1(ChainMetaData { parent_hash, .. }) - if matches!(peer.version, PeerVer::V2) => - { + let message = match (self.state.clone(), &peer.version) { + (SyncState::Phase1(ChainMetaData { parent_hash, .. }), PeerVer::V2) => { ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), from_hash: parent_hash, batch_size: self.max_batch_size, }) } - SyncState::Phase1(ChainMetaData { view_number, .. }) - if matches!(peer.version, PeerVer::V1) => - { + (SyncState::Phase1(ChainMetaData { view_number, .. }), PeerVer::V1) => { // For V1 BlockRequest, we request a little more than we need, due to drift // Since the view number is an 'internal' clock, it is possible for the same block number // to have different view numbers. @@ -963,7 +959,7 @@ impl Sync { from_view: view_number.saturating_sub(self.max_batch_size as u64), }) } - SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V2) => { + (SyncState::Phase0, PeerVer::V2) if meta.is_some() => { let meta = meta.unwrap(); let parent_hash = meta.parent_hash; self.state = SyncState::Phase1(meta); @@ -973,7 +969,7 @@ impl Sync { batch_size: self.max_batch_size, }) } - SyncState::Phase0 if meta.is_some() && matches!(peer.version, PeerVer::V1) => { + (SyncState::Phase0, PeerVer::V1) if meta.is_some() => { let meta = meta.unwrap(); let view_number = meta.view_number; self.state = SyncState::Phase1(meta); @@ -1133,17 +1129,12 @@ impl Sync { /// for in a single request, between 1-100 blocks. fn dynamic_batch_sizing(&self, peer: &PeerInfo) -> usize { match (&self.state, &peer.version, &self.in_flight_reason) { - // V1 response may be too large. Reduce request range. - (SyncState::Phase1(_), PeerVer::V1, DownGrade::Timeout) => self - .max_batch_size - .saturating_sub(self.max_batch_size / 2) - .max(1), - // V1 response may be too large. Reduce request range. + // V1 response may be too large, reduce request range. 
(SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => self .max_batch_size .saturating_sub(self.max_batch_size / 3) .max(1), - // V1 responses are going well, increase the request range linearly + // V1 response going well, increase the request range (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) => self .max_batch_size .saturating_add(self.max_batch_size) @@ -1233,7 +1224,7 @@ impl PartialOrd for DownGrade { } /// Sync state -#[derive(Debug)] +#[derive(Debug, Clone)] enum SyncState { Phase0, Phase1(ChainMetaData), From 2146f8409158e6a77c80862f7df71d9135fbee61 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 20 Jan 2025 11:17:16 +0800 Subject: [PATCH 076/119] fix: improve test sync, added Network::run_until_synced(); --- zilliqa/tests/it/main.rs | 20 ++++++++++++++++++++ zilliqa/tests/it/staking.rs | 20 +++++++++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 481b73c8b..a0fa93758 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -836,6 +836,10 @@ impl Network { true } } + AnyMessage::External(ExternalMessage::InjectedProposal(_)) => { + self.handle_message(m.clone()); + false + } _ => true, }); @@ -1049,6 +1053,22 @@ impl Network { } } + async fn run_until_synced(&mut self, index: usize) { + let mut check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); + while check == index { + check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); + } + self.run_until( + |net| { + net.get_node(index).get_finalized_height().unwrap() + >= net.get_node(check).get_finalized_height().unwrap() + }, + 1000, + ) + .await + .unwrap(); + } + async fn run_until( &mut self, mut condition: impl FnMut(&mut Network) -> bool, diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index ed5c83473..9c62b02f6 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -425,12 +425,28 @@ async fn rewards_are_sent_to_reward_address_of_proposer(mut network: Network) { check_miner_got_reward(&wallet, 1).await; } +async fn wait_for_sync(network: &mut Network, index: usize) { + info!("syncing node #{}", index); + // sync up new node + network + .run_until( + |net| { + net.get_node(index).get_finalized_height().unwrap() + >= net.get_node(0).get_finalized_height().unwrap() + }, + 1000, + ) + .await + .unwrap(); + info!("synced node #{}", index); +} + #[zilliqa_macros::test(blocks_per_epoch = 2, deposit_v3_upgrade_block_height = 12)] async fn validators_can_join_and_become_proposer(mut network: Network) { let wallet = network.genesis_wallet().await; // randomise the current epoch state and current leader - let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); + let blocks_to_prerun = network.rng.lock().unwrap().gen_range(1..8); network .run_until_block(&wallet, blocks_to_prerun.into(), 100) .await; @@ -447,6 +463,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); + wait_for_sync(&mut network, index).await; let deposit_hash = deposit_stake( &mut network, &wallet, @@ -535,6 +552,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) .await; + wait_for_sync(&mut network, index).await; let deposit_hash = deposit_v3_stake( &mut network, &wallet, From e0b2914e0e9133039fbd8fef1a9042f88a09256e Mon Sep 17 00:00:00 2001 From: Shawn Date: 
Mon, 20 Jan 2025 11:20:28 +0800 Subject: [PATCH 077/119] fix: fixed unreliable::blocks_are_produced_while_a_node_restarts() test. --- zilliqa/tests/it/main.rs | 6 ++---- zilliqa/tests/it/unreliable.rs | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index a0fa93758..3462b64e7 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -1054,10 +1054,8 @@ impl Network { } async fn run_until_synced(&mut self, index: usize) { - let mut check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); - while check == index { - check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); - } + assert!(self.nodes.len() > 1); + let check = if index != 0 { 0 } else { 1 }; self.run_until( |net| { net.get_node(index).get_finalized_height().unwrap() diff --git a/zilliqa/tests/it/unreliable.rs b/zilliqa/tests/it/unreliable.rs index 7518a3131..b35787e93 100644 --- a/zilliqa/tests/it/unreliable.rs +++ b/zilliqa/tests/it/unreliable.rs @@ -26,6 +26,7 @@ async fn blocks_are_produced_while_a_node_restarts(mut network: Network) { // Reconnect the 'restarted' node. network.connect_node(restarted_node); + network.run_until_synced(restarted_node).await; // TODO(#721): We should assert here that a new view occurred if-and-only-if the 'restarted' node was the proposer // of blocks 3 or 4. This would tell us that we aren't producing new views unnecessarily. From 2e1a11178a6828483af0e78004e16bcc245fd04f Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 20 Jan 2025 12:27:14 +0800 Subject: [PATCH 078/119] fix: staking::validators_can_join_and_become_proposer() test. --- zilliqa/tests/it/staking.rs | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index 9c62b02f6..ed2954b0c 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -55,7 +55,7 @@ async fn deposit_stake( .await .unwrap() .tx_hash(); - network.run_until_receipt(staker_wallet, hash, 80).await; + network.run_until_receipt(staker_wallet, hash, 180).await; // Stake the new validator's funds. let tx = TransactionRequest::new() @@ -82,7 +82,7 @@ async fn deposit_stake( .await .unwrap() .tx_hash(); - let receipt = network.run_until_receipt(staker_wallet, hash, 80).await; + let receipt = network.run_until_receipt(staker_wallet, hash, 180).await; assert_eq!(receipt.status.unwrap().as_u64(), 1); hash } @@ -105,7 +105,7 @@ async fn deposit_v3_stake( .await .unwrap() .tx_hash(); - network.run_until_receipt(staker_wallet, hash, 80).await; + network.run_until_receipt(staker_wallet, hash, 180).await; // Stake the new validator's funds. 
let tx = TransactionRequest::new() @@ -133,7 +133,7 @@ async fn deposit_v3_stake( .await .unwrap() .tx_hash(); - let receipt = network.run_until_receipt(staker_wallet, hash, 80).await; + let receipt = network.run_until_receipt(staker_wallet, hash, 180).await; assert_eq!(receipt.status.unwrap().as_u64(), 1); hash } @@ -425,28 +425,12 @@ async fn rewards_are_sent_to_reward_address_of_proposer(mut network: Network) { check_miner_got_reward(&wallet, 1).await; } -async fn wait_for_sync(network: &mut Network, index: usize) { - info!("syncing node #{}", index); - // sync up new node - network - .run_until( - |net| { - net.get_node(index).get_finalized_height().unwrap() - >= net.get_node(0).get_finalized_height().unwrap() - }, - 1000, - ) - .await - .unwrap(); - info!("synced node #{}", index); -} - #[zilliqa_macros::test(blocks_per_epoch = 2, deposit_v3_upgrade_block_height = 12)] async fn validators_can_join_and_become_proposer(mut network: Network) { let wallet = network.genesis_wallet().await; // randomise the current epoch state and current leader - let blocks_to_prerun = network.rng.lock().unwrap().gen_range(1..8); + let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); network .run_until_block(&wallet, blocks_to_prerun.into(), 100) .await; @@ -463,7 +447,6 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); - wait_for_sync(&mut network, index).await; let deposit_hash = deposit_stake( &mut network, &wallet, @@ -552,7 +535,6 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) .await; - wait_for_sync(&mut network, index).await; let deposit_hash = deposit_v3_stake( &mut network, &wallet, From 25102c5f641890ff50ec28989a954c5e090d1519 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 20 Jan 2025 14:34:12 +0800 Subject: [PATCH 079/119] fix: tests. 
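The main.rs hunk below guards the test network's message pump against an empty queue: gen_range(0..messages.len()) panics when messages is empty, so a message is only drawn when at least one is pending, and the rest are requeued. A small sketch of the pattern, assuming the rand crate; the harness types are omitted.

    use rand::Rng;

    // Pick and remove one random element, or return None when nothing is
    // queued, avoiding the gen_range(0..0) panic.
    fn pick_random<T>(items: &mut Vec<T>, rng: &mut impl Rng) -> Option<T> {
        if items.is_empty() {
            return None;
        }
        let index = rng.gen_range(0..items.len());
        Some(items.swap_remove(index))
    }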
--- zilliqa/tests/it/consensus.rs | 3 ++- zilliqa/tests/it/main.rs | 35 ++++++++++++++++++---------------- zilliqa/tests/it/staking.rs | 2 ++ zilliqa/tests/it/unreliable.rs | 2 +- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/zilliqa/tests/it/consensus.rs b/zilliqa/tests/it/consensus.rs index 607c00fee..7d110fb90 100644 --- a/zilliqa/tests/it/consensus.rs +++ b/zilliqa/tests/it/consensus.rs @@ -106,6 +106,7 @@ async fn block_production(mut network: Network) { info!("Adding networked node."); let index = network.add_node(); + network.run_until_synced(index).await; network .run_until( @@ -116,7 +117,7 @@ async fn block_production(mut network: Network) { .map_or(0, |b| b.number()) >= 10 }, - 1000, + 2000, ) .await .unwrap(); diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 3462b64e7..4b357d524 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -844,19 +844,20 @@ impl Network { }); // Pick a random message - let index = self.rng.lock().unwrap().gen_range(0..messages.len()); - let (source, destination, message) = messages.swap_remove(index); - // Requeue the other messages - for message in messages { - self.resend_message.send(message).unwrap(); - } - - trace!( - "{}", - format_message(&self.nodes, source, destination, &message) - ); + if !messages.is_empty() { + let index = self.rng.lock().unwrap().gen_range(0..messages.len()); + let (source, destination, message) = messages.swap_remove(index); + // Requeue the other messages + for message in messages { + self.resend_message.send(message).unwrap(); + } + trace!( + "{}", + format_message(&self.nodes, source, destination, &message) + ); - self.handle_message((source, destination, message)) + self.handle_message((source, destination, message)) + } } fn handle_message(&mut self, message: StreamMessage) { @@ -1054,14 +1055,16 @@ impl Network { } async fn run_until_synced(&mut self, index: usize) { - assert!(self.nodes.len() > 1); - let check = if index != 0 { 0 } else { 1 }; + let mut check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); + while index == check { + check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); + } self.run_until( |net| { net.get_node(index).get_finalized_height().unwrap() - >= net.get_node(check).get_finalized_height().unwrap() + == net.get_node(check).get_finalized_height().unwrap() }, - 1000, + 10000, ) .await .unwrap(); diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index ed2954b0c..f6d4455fc 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -447,6 +447,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); + network.run_until_synced(index).await; let deposit_hash = deposit_stake( &mut network, &wallet, @@ -535,6 +536,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) .await; + network.run_until_synced(index).await; let deposit_hash = deposit_v3_stake( &mut network, &wallet, diff --git a/zilliqa/tests/it/unreliable.rs b/zilliqa/tests/it/unreliable.rs index b35787e93..8a68acecf 100644 --- a/zilliqa/tests/it/unreliable.rs +++ b/zilliqa/tests/it/unreliable.rs @@ -32,5 +32,5 @@ async fn blocks_are_produced_while_a_node_restarts(mut network: Network) { // of blocks 3 or 4. This would tell us that we aren't producing new views unnecessarily. 
// Ensure more blocks are produced. - network.run_until_block(&wallet, 12.into(), 400).await; + network.run_until_block(&wallet, 12.into(), 1400).await; } From cab93f3a2eb62181ad52fb91fe4e918505f417f6 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 21 Jan 2025 15:35:32 +0800 Subject: [PATCH 080/119] nit: use Db::contains_block() instead of Db::get_block_by_hash(). --- zilliqa/src/node.rs | 2 +- zilliqa/src/sync.rs | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index 58744ce98..d04a7a7ec 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -917,8 +917,8 @@ impl Node { return Ok(()); } trace!("Handling proposal for view {0}", req.block.header.view); - self.consensus.sync.mark_received_proposal(&req)?; let proposal = self.consensus.receive_block(from, req.block)?; + self.consensus.sync.mark_received_proposal(req.from)?; if let Some(proposal) = proposal { trace!( " ... broadcasting proposal for view {0}", diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 907049429..5ef335a3e 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -353,7 +353,7 @@ impl Sync { // Check if we are out of sync SyncState::Phase0 if self.in_pipeline == 0 => { let parent_hash = self.recent_proposals.back().unwrap().header.qc.block_hash; - if self.db.get_block_by_hash(&parent_hash)?.is_none() { + if !self.db.contains_block(&parent_hash)? { // No parent block, trigger sync tracing::warn!("sync::SyncProposal : syncing from {parent_hash}",); let block_hash = self.recent_proposals.back().unwrap().hash(); @@ -389,7 +389,7 @@ impl Sync { // Wait till 99% synced, zip it up! SyncState::Phase3 if self.in_pipeline == 0 => { let ancestor_hash = self.recent_proposals.front().unwrap().header.qc.block_hash; - if self.db.get_block_by_hash(&ancestor_hash)?.is_some() { + if self.db.contains_block(&ancestor_hash)? { tracing::info!( "sync::SyncProposal : finishing {} blocks for segment #{} from {}", self.recent_proposals.len(), @@ -399,9 +399,10 @@ impl Sync { // inject the proposals let proposals = self.recent_proposals.drain(..).collect_vec(); self.inject_proposals(proposals)?; + } else { + self.empty_metadata()?; + self.state = SyncState::Phase0; } - self.empty_metadata()?; - self.state = SyncState::Phase0; } // Retry to fix sync issues e.g. peers that are now offline SyncState::Retry1 if self.in_pipeline == 0 => { @@ -852,7 +853,7 @@ impl Sync { self.insert_metadata(segment)?; // If the segment hits our history, start Phase 2. - if self.db.get_block_by_hash(&last_block_hash)?.is_some() { + if self.db.contains_block(&last_block_hash)? { self.state = SyncState::Phase2(Hash::ZERO); } else if Self::DO_SPECULATIVE { self.request_missing_metadata(None)?; @@ -1042,19 +1043,13 @@ impl Sync { /// Mark a received proposal /// /// Mark a proposal as received, and remove it from the chain. 
- pub fn mark_received_proposal(&mut self, prop: &InjectedProposal) -> Result<()> { - if prop.from != self.peer_id { + pub fn mark_received_proposal(&mut self, from: PeerId) -> Result<()> { + if from != self.peer_id { tracing::error!( "sync::MarkReceivedProposal : foreign InjectedProposal from {}", - prop.from + from ); } - // if let Some(p) = self.chain_metadata.remove(&prop.block.hash()) { - // tracing::warn!( - // "sync::MarkReceivedProposal : removing stale metadata {}", - // p.block_hash - // ); - // } self.in_pipeline = self.in_pipeline.saturating_sub(1); Ok(()) } From 497e80720f2361d0b35e6b59cdc7dcb8a8006bb2 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 21 Jan 2025 15:36:10 +0800 Subject: [PATCH 081/119] fix: tests. --- zilliqa/tests/it/main.rs | 8 +++++--- zilliqa/tests/it/staking.rs | 3 +-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 4b357d524..8c81b83ce 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -1061,10 +1061,12 @@ impl Network { } self.run_until( |net| { - net.get_node(index).get_finalized_height().unwrap() - == net.get_node(check).get_finalized_height().unwrap() + let height_i = net.get_node(index).get_finalized_height().unwrap(); + let height_c = net.get_node(check).get_finalized_height().unwrap(); + info!("syncing {}/{}", height_i, height_c); + height_c == height_i && height_i > 0 }, - 10000, + 1000, ) .await .unwrap(); diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index f6d4455fc..b745a27db 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -447,7 +447,6 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); - network.run_until_synced(index).await; let deposit_hash = deposit_stake( &mut network, &wallet, @@ -536,7 +535,6 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) .await; - network.run_until_synced(index).await; let deposit_hash = deposit_v3_stake( &mut network, &wallet, @@ -609,6 +607,7 @@ async fn block_proposers_are_selected_proportionally_to_their_stake(mut network: let staker_wallet = network.wallet_of_node(index).await; let pop_signature = new_validator_key.pop_prove(); + network.run_until_synced(index).await; deposit_stake( &mut network, &wallet, From 37fb852bd265abd06d16fac94a299cea5cee739a Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 21 Jan 2025 19:57:23 +0800 Subject: [PATCH 082/119] feat: retry sync against upgraded Peer, immediately. --- zilliqa/src/consensus.rs | 2 +- zilliqa/src/sync.rs | 26 ++++++++++---------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index f079b538d..393ba5200 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -205,7 +205,7 @@ impl Consensus { )?; } - let sync = Sync::new(&config, db.clone(), message_sender.clone(), Vec::new())?; + let sync = Sync::new(&config, db.clone(), message_sender.clone())?; let latest_block = db .get_finalized_view()? diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 5ef335a3e..d9746bb7a 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -99,24 +99,10 @@ impl Sync { // Minimum of 2 peers to avoid single source of truth. 
const MIN_PEERS: usize = 2; - pub fn new( - config: &NodeConfig, - db: Arc, - message_sender: MessageSender, - peers: Vec, - ) -> Result { + pub fn new(config: &NodeConfig, db: Arc, message_sender: MessageSender) -> Result { let peer_id = message_sender.our_peer_id; let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. - let peers = peers - .into_iter() - .map(|peer_id| PeerInfo { - version: PeerVer::V1, // default to V1 peer, until otherwise proven. - score: 0, - peer_id, - last_used: Instant::now(), - }) - .collect(); // This DB could be left in-here as it is only used in this module // TODO: Make this in-memory by exploiting SQLite TEMP tables i.e. CREATE TEMP TABLE @@ -154,7 +140,7 @@ impl Sync { Ok(Self { db, message_sender, - peers, + peers: BinaryHeap::new(), peer_id, request_timeout: config.consensus.consensus_timeout, max_batch_size, @@ -698,6 +684,14 @@ impl Sync { if let Some(peer) = self.in_flight.as_mut() { if peer.peer_id == from { peer.version = PeerVer::V2; + // retry with upgraded peer + peer.last_used = self + .peers + .peek() + .expect("peers.len() > 1") + .last_used + .checked_sub(Duration::from_secs(1)) + .expect("time is ordinal"); self.done_with_peer(DownGrade::None); } } From dd1b9f89f92e4f094d4010f46645a101b5bd9897 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 21 Jan 2025 21:47:10 +0800 Subject: [PATCH 083/119] fix: checkpoints_test(), randomized add_peers() for tests. --- zilliqa/tests/it/main.rs | 9 ++++++--- zilliqa/tests/it/persistence.rs | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 8c81b83ce..59bc90c11 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -401,7 +401,8 @@ impl Network { let receive_resend_message = UnboundedReceiverStream::new(receive_resend_message).boxed(); receivers.push(receive_resend_message); - let peers = nodes.iter().map(|n| n.peer_id).collect_vec(); + let mut peers = nodes.iter().map(|n| n.peer_id).collect_vec(); + peers.shuffle(rng.lock().unwrap().deref_mut()); for node in &nodes { trace!( @@ -513,7 +514,8 @@ impl Network { let (node, receiver, local_receiver, request_responses) = node(config, secret_key, onchain_key, self.nodes.len(), None).unwrap(); - let peers = self.nodes.iter().map(|n| n.peer_id).collect_vec(); + let mut peers = self.nodes.iter().map(|n| n.peer_id).collect_vec(); + peers.shuffle(self.rng.lock().unwrap().deref_mut()); node.inner.lock().unwrap().consensus.sync.add_peers(peers); trace!("Node {}: {}", node.index, node.peer_id); @@ -578,7 +580,8 @@ impl Network { .chain(request_response_receivers) .collect(); - let peers = nodes.iter().map(|n| n.peer_id).collect_vec(); + let mut peers = nodes.iter().map(|n| n.peer_id).collect_vec(); + peers.shuffle(self.rng.lock().unwrap().deref_mut()); for node in &nodes { trace!( diff --git a/zilliqa/tests/it/persistence.rs b/zilliqa/tests/it/persistence.rs index b4230308a..5505d522b 100644 --- a/zilliqa/tests/it/persistence.rs +++ b/zilliqa/tests/it/persistence.rs @@ -268,6 +268,7 @@ async fn checkpoints_test(mut network: Network) { assert_eq!(state["welcome_msg"], "default"); // check the new node catches up and keeps up with block production + network.run_until_synced(new_node_idx).await; network .run_until_block(&new_node_wallet, 20.into(), 200) .await; From df7fc9447289255701bdf02911fd88532a5a0356 
Mon Sep 17 00:00:00 2001 From: Shawn Date: Wed, 22 Jan 2025 09:29:40 +0800 Subject: [PATCH 084/119] fix: handle_forking(), validators_can_join() test. --- zilliqa/tests/it/consensus.rs | 1 + zilliqa/tests/it/staking.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/zilliqa/tests/it/consensus.rs b/zilliqa/tests/it/consensus.rs index 7d110fb90..4d87d39c8 100644 --- a/zilliqa/tests/it/consensus.rs +++ b/zilliqa/tests/it/consensus.rs @@ -170,6 +170,7 @@ async fn handle_forking_correctly(mut network: Network) { let original_receipt = first.unwrap(); trace!("Running until the network has reverted the block"); + network.run_until_synced(0).await; // Now we should be able to run the network until we get a different tx receipt from the first // node, which indicates that it has reverted the block network diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index b745a27db..4817faf00 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -447,6 +447,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); + network.run_until_synced(index).await; let deposit_hash = deposit_stake( &mut network, &wallet, @@ -531,6 +532,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { ); // Give new node time to catch up to block including deposit_v3 deployment + network.run_until_synced(index).await; network .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) .await; From a57ed20db9fc33c2867e9b084bea8fe14b5da587 Mon Sep 17 00:00:00 2001 From: Shawn Date: Wed, 22 Jan 2025 10:10:21 +0800 Subject: [PATCH 085/119] fix: experiment. --- zilliqa/src/db.rs | 2 +- zilliqa/src/sync.rs | 18 +++++++++++++----- zilliqa/tests/it/main.rs | 14 ++++++++++---- zilliqa/tests/it/staking.rs | 16 ++++++++-------- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/zilliqa/src/db.rs b/zilliqa/src/db.rs index 31c104b90..aba4e0968 100644 --- a/zilliqa/src/db.rs +++ b/zilliqa/src/db.rs @@ -1191,7 +1191,7 @@ fn decompress_file + Debug>(input_file_path: P, output_file_path: /// An implementor of [eth_trie::DB] which uses a [Connection] to persist data. #[derive(Debug, Clone)] pub struct TrieStorage { - pub db: Arc>, + db: Arc>, cache: Arc, Vec>>>, } diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index d9746bb7a..5ad85aba8 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -385,10 +385,9 @@ impl Sync { // inject the proposals let proposals = self.recent_proposals.drain(..).collect_vec(); self.inject_proposals(proposals)?; - } else { - self.empty_metadata()?; - self.state = SyncState::Phase0; } + self.empty_metadata()?; + self.state = SyncState::Phase0; } // Retry to fix sync issues e.g. peers that are now offline SyncState::Retry1 if self.in_pipeline == 0 => { @@ -693,6 +692,14 @@ impl Sync { .checked_sub(Duration::from_secs(1)) .expect("time is ordinal"); self.done_with_peer(DownGrade::None); + + if Self::DO_SPECULATIVE { + match self.state { + SyncState::Phase1(_) => self.request_missing_metadata(None)?, + SyncState::Phase2(_) => self.request_missing_blocks()?, + _ => {} + } + } } } return Ok(()); @@ -1137,8 +1144,9 @@ impl Sync { /// Returns (am_syncing, current_highest_block) pub fn am_syncing(&self) -> Result { Ok(self.in_pipeline != 0 - || self.count_segments()? 
!= 0 - || !self.recent_proposals.is_empty()) + || !matches!(self.state, SyncState::Phase0) + || !self.recent_proposals.is_empty() + || self.count_segments()? != 0) } // Returns (starting_block, current_block, highest_block) if we're syncing, diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 59bc90c11..e826aea73 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -995,7 +995,7 @@ impl Network { external_message.clone(), response_channel, ) - .unwrap(); + .ok(); // TODO: better error handling } }); } @@ -1062,14 +1062,20 @@ impl Network { while index == check { check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); } + let mut debounce = 0; + let mut old_height = 0; self.run_until( |net| { let height_i = net.get_node(index).get_finalized_height().unwrap(); let height_c = net.get_node(check).get_finalized_height().unwrap(); - info!("syncing {}/{}", height_i, height_c); - height_c == height_i && height_i > 0 + info!("syncing {}/{}/{}", height_i, height_c, debounce); + if height_c == height_i && height_i > old_height { + debounce += 1; + old_height = height_i; + } + debounce == 3 }, - 1000, + 10000, ) .await .unwrap(); diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index 4817faf00..65dd1a7e8 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -55,7 +55,7 @@ async fn deposit_stake( .await .unwrap() .tx_hash(); - network.run_until_receipt(staker_wallet, hash, 180).await; + network.run_until_receipt(staker_wallet, hash, 80).await; // Stake the new validator's funds. let tx = TransactionRequest::new() @@ -82,7 +82,7 @@ async fn deposit_stake( .await .unwrap() .tx_hash(); - let receipt = network.run_until_receipt(staker_wallet, hash, 180).await; + let receipt = network.run_until_receipt(staker_wallet, hash, 80).await; assert_eq!(receipt.status.unwrap().as_u64(), 1); hash } @@ -105,7 +105,7 @@ async fn deposit_v3_stake( .await .unwrap() .tx_hash(); - network.run_until_receipt(staker_wallet, hash, 180).await; + network.run_until_receipt(staker_wallet, hash, 80).await; // Stake the new validator's funds. let tx = TransactionRequest::new() @@ -133,7 +133,7 @@ async fn deposit_v3_stake( .await .unwrap() .tx_hash(); - let receipt = network.run_until_receipt(staker_wallet, hash, 180).await; + let receipt = network.run_until_receipt(staker_wallet, hash, 80).await; assert_eq!(receipt.status.unwrap().as_u64(), 1); hash } @@ -430,10 +430,10 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let wallet = network.genesis_wallet().await; // randomise the current epoch state and current leader - let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); - network - .run_until_block(&wallet, blocks_to_prerun.into(), 100) - .await; + // let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); + // network + // .run_until_block(&wallet, blocks_to_prerun.into(), 100) + // .await; // First test joining deposit_v2 let index = network.add_node(); From d720cd5e30da9f002480870074679eee7b447df2 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 23 Jan 2025 09:59:07 +0800 Subject: [PATCH 086/119] fix: checkpoints_test(). 
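For context on the debounced run_until_synced() introduced in PATCH 085 above: two nodes are only treated as synced once their finalized heights agree on several consecutive polls, each at a greater height than the previous match, which filters out the case where both nodes are briefly stuck at the same stale height. A minimal sketch of that condition; the polling closure is illustrative, not the test harness API.

    // Returns true once poll() reports matching, strictly increasing finalized
    // heights `required` times within `max_polls` attempts.
    fn is_synced(mut poll: impl FnMut() -> (u64, u64), required: u32, max_polls: u32) -> bool {
        let mut matches = 0;
        let mut last_height = 0;
        for _ in 0..max_polls {
            let (a, b) = poll();
            if a == b && a > last_height {
                matches += 1;
                last_height = a;
                if matches >= required {
                    return true;
                }
            }
        }
        false
    }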
--- zilliqa/tests/it/persistence.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/zilliqa/tests/it/persistence.rs b/zilliqa/tests/it/persistence.rs index 5505d522b..ccaf27eea 100644 --- a/zilliqa/tests/it/persistence.rs +++ b/zilliqa/tests/it/persistence.rs @@ -169,7 +169,7 @@ async fn checkpoints_test(mut network: Network) { .await .unwrap() .tx_hash(); - network.run_until_receipt(&wallet, update_tx_hash, 50).await; + network.run_until_receipt(&wallet, update_tx_hash, 51).await; // Scilla let (secret_key, address) = zilliqa_account(&mut network).await; let code = scilla_test_contract_code(); @@ -178,7 +178,7 @@ async fn checkpoints_test(mut network: Network) { deploy_scilla_contract(&mut network, &secret_key, &code, &data).await; // Run until block 9 so that we can insert a tx in block 10 (note that this transaction may not *always* appear in the desired block, therefore we do not assert its presence later) - network.run_until_block(&wallet, 9.into(), 200).await; + network.run_until_block(&wallet, 9.into(), 209).await; let _hash = wallet .send_transaction(TransactionRequest::pay(wallet.address(), 10), None) @@ -187,7 +187,7 @@ async fn checkpoints_test(mut network: Network) { .tx_hash(); // wait 10 blocks for checkpoint to happen - then 3 more to finalize that block - network.run_until_block(&wallet, 13.into(), 200).await; + network.run_until_block(&wallet, 13.into(), 213).await; let checkpoint_files = network .nodes @@ -268,9 +268,8 @@ async fn checkpoints_test(mut network: Network) { assert_eq!(state["welcome_msg"], "default"); // check the new node catches up and keeps up with block production - network.run_until_synced(new_node_idx).await; network - .run_until_block(&new_node_wallet, 20.into(), 200) + .run_until_block(&new_node_wallet, 20.into(), 220) .await; // check account nonce of old wallet From 902e38b0ea78eea6aa3429b7c21d9e745dd1c1a0 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 23 Jan 2025 12:03:04 +0800 Subject: [PATCH 087/119] feat: check for checkpoint block, not just history. --- zilliqa/src/consensus.rs | 2 ++ zilliqa/src/sync.rs | 22 +++++++++++++++++++--- zilliqa/tests/it/main.rs | 2 +- zilliqa/tests/it/persistence.rs | 1 + 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 393ba5200..dbe036077 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -367,6 +367,8 @@ impl Consensus { .at_root(parent.state_root_hash().into()) .get_stakers(block.header)?, )?; + + consensus.sync.set_checkpoint(&block); } // If timestamp of when current high_qc was written exists then use it to estimate the minimum number of blocks the network has moved on since shut down diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 5ad85aba8..7a70788ca 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -87,6 +87,8 @@ pub struct Sync { inject_at: Option<(std::time::Instant, usize)>, // record starting number, for eth_syncing() RPC call. 
started_at_block_number: u64, + // checkpoint + checkpoint_hash: Option, } impl Sync { @@ -152,9 +154,16 @@ impl Sync { recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, started_at_block_number: 0, + checkpoint_hash: None, }) } + pub fn set_checkpoint(&mut self, checkpoint: &Block) { + let hash = checkpoint.hash(); + tracing::info!("sync::Checkpoint {}", hash); + self.checkpoint_hash = Some(hash); + } + /// Returns the number of stored segments fn count_segments(&self) -> Result { let mut segments = 0; @@ -284,7 +293,7 @@ impl Sync { } /// Bulk inserts a bunch of metadata. - fn insert_metadata(&self, metas: Vec) -> Result<()> { + fn insert_metadata(&self, metas: &Vec) -> Result<()> { self.db.with_sqlite_tx(|c| { for meta in metas { c.prepare_cached( @@ -851,10 +860,17 @@ impl Sync { ); // Record the constructed chain metadata - self.insert_metadata(segment)?; + self.insert_metadata(&segment)?; + + // If the checkpoint is in this segment, + let checkpointed = if let Some(checkpoint) = self.checkpoint_hash { + segment.iter().any(|b| b.block_hash == checkpoint) + } else { + false + }; // If the segment hits our history, start Phase 2. - if self.db.contains_block(&last_block_hash)? { + if checkpointed || self.db.contains_block(&last_block_hash)? { self.state = SyncState::Phase2(Hash::ZERO); } else if Self::DO_SPECULATIVE { self.request_missing_metadata(None)?; diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index e826aea73..ac370166c 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -1075,7 +1075,7 @@ impl Network { } debounce == 3 }, - 10000, + 2000, ) .await .unwrap(); diff --git a/zilliqa/tests/it/persistence.rs b/zilliqa/tests/it/persistence.rs index ccaf27eea..7757f1859 100644 --- a/zilliqa/tests/it/persistence.rs +++ b/zilliqa/tests/it/persistence.rs @@ -268,6 +268,7 @@ async fn checkpoints_test(mut network: Network) { assert_eq!(state["welcome_msg"], "default"); // check the new node catches up and keeps up with block production + network.run_until_synced(new_node_idx).await; network .run_until_block(&new_node_wallet, 20.into(), 220) .await; From 626e49d8f1b059804596c931a96771bfea224b59 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 23 Jan 2025 12:53:17 +0800 Subject: [PATCH 088/119] fix: undoing some test timeouts. 
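For context on the checkpoint handling added in PATCH 087 above: a node started from a checkpoint has no history below the checkpoint block, so the metadata back-fill must also stop when a segment reaches the recorded checkpoint hash, not only when it reaches a block already in the local DB. A rough sketch of that termination condition, using stand-in types rather than the zilliqa definitions.

    // The segment is complete when it contains the checkpoint block, or when
    // its oldest block is already present locally.
    fn segment_complete(
        segment_hashes: &[[u8; 32]],
        checkpoint_hash: Option<[u8; 32]>,
        have_block: impl Fn(&[u8; 32]) -> bool,
    ) -> bool {
        let checkpointed = checkpoint_hash
            .map(|cp| segment_hashes.iter().any(|h| *h == cp))
            .unwrap_or(false);
        checkpointed || segment_hashes.last().map(|h| have_block(h)).unwrap_or(false)
    }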
--- zilliqa/tests/it/consensus.rs | 2 +- zilliqa/tests/it/main.rs | 1 - zilliqa/tests/it/persistence.rs | 8 ++++---- zilliqa/tests/it/unreliable.rs | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/zilliqa/tests/it/consensus.rs b/zilliqa/tests/it/consensus.rs index 4d87d39c8..e456f52c3 100644 --- a/zilliqa/tests/it/consensus.rs +++ b/zilliqa/tests/it/consensus.rs @@ -117,7 +117,7 @@ async fn block_production(mut network: Network) { .map_or(0, |b| b.number()) >= 10 }, - 2000, + 100, ) .await .unwrap(); diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index ac370166c..cf07bcecb 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -1068,7 +1068,6 @@ impl Network { |net| { let height_i = net.get_node(index).get_finalized_height().unwrap(); let height_c = net.get_node(check).get_finalized_height().unwrap(); - info!("syncing {}/{}/{}", height_i, height_c, debounce); if height_c == height_i && height_i > old_height { debounce += 1; old_height = height_i; diff --git a/zilliqa/tests/it/persistence.rs b/zilliqa/tests/it/persistence.rs index 7757f1859..5505d522b 100644 --- a/zilliqa/tests/it/persistence.rs +++ b/zilliqa/tests/it/persistence.rs @@ -169,7 +169,7 @@ async fn checkpoints_test(mut network: Network) { .await .unwrap() .tx_hash(); - network.run_until_receipt(&wallet, update_tx_hash, 51).await; + network.run_until_receipt(&wallet, update_tx_hash, 50).await; // Scilla let (secret_key, address) = zilliqa_account(&mut network).await; let code = scilla_test_contract_code(); @@ -178,7 +178,7 @@ async fn checkpoints_test(mut network: Network) { deploy_scilla_contract(&mut network, &secret_key, &code, &data).await; // Run until block 9 so that we can insert a tx in block 10 (note that this transaction may not *always* appear in the desired block, therefore we do not assert its presence later) - network.run_until_block(&wallet, 9.into(), 209).await; + network.run_until_block(&wallet, 9.into(), 200).await; let _hash = wallet .send_transaction(TransactionRequest::pay(wallet.address(), 10), None) @@ -187,7 +187,7 @@ async fn checkpoints_test(mut network: Network) { .tx_hash(); // wait 10 blocks for checkpoint to happen - then 3 more to finalize that block - network.run_until_block(&wallet, 13.into(), 213).await; + network.run_until_block(&wallet, 13.into(), 200).await; let checkpoint_files = network .nodes @@ -270,7 +270,7 @@ async fn checkpoints_test(mut network: Network) { // check the new node catches up and keeps up with block production network.run_until_synced(new_node_idx).await; network - .run_until_block(&new_node_wallet, 20.into(), 220) + .run_until_block(&new_node_wallet, 20.into(), 200) .await; // check account nonce of old wallet diff --git a/zilliqa/tests/it/unreliable.rs b/zilliqa/tests/it/unreliable.rs index 8a68acecf..b35787e93 100644 --- a/zilliqa/tests/it/unreliable.rs +++ b/zilliqa/tests/it/unreliable.rs @@ -32,5 +32,5 @@ async fn blocks_are_produced_while_a_node_restarts(mut network: Network) { // of blocks 3 or 4. This would tell us that we aren't producing new views unnecessarily. // Ensure more blocks are produced. - network.run_until_block(&wallet, 12.into(), 1400).await; + network.run_until_block(&wallet, 12.into(), 400).await; } From b6509d4e296518a2dd6a4732894bf64d6207fe59 Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 23 Jan 2025 19:40:09 +0800 Subject: [PATCH 089/119] feat: replace ChainMetaData with BlockHeader. 
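The dedicated ChainMetaData struct goes away because every field the sync layer tracks already has a slot on BlockHeader, with the parent hash carried in the header's QC. A simplified sketch of the mapping, using stand-in types rather than the zilliqa definitions; the real headers keep their remaining fields at default values when built from bare metadata.

    struct QuorumCertificate { block_hash: [u8; 32] }
    struct BlockHeader { view: u64, number: u64, hash: [u8; 32], qc: QuorumCertificate }

    // Old ChainMetaData field -> new home on the header:
    //   parent_hash  -> header.qc.block_hash
    //   block_hash   -> header.hash
    //   block_number -> header.number
    //   view_number  -> header.view
    fn header_from_meta(parent: [u8; 32], hash: [u8; 32], number: u64, view: u64) -> BlockHeader {
        BlockHeader { view, number, hash, qc: QuorumCertificate { block_hash: parent } }
    }

The BlockHeader::from_meta_data helper added at the end of sync.rs in this patch does the same thing against the real types.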
--- zilliqa/src/consensus.rs | 3 +- zilliqa/src/message.rs | 13 +--- zilliqa/src/sync.rs | 164 +++++++++++++++++++++------------------ zilliqa/tests/it/main.rs | 10 ++- 4 files changed, 98 insertions(+), 92 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index dbe036077..294ca6b93 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -3160,9 +3160,8 @@ impl Consensus { trace!("consensus::tick()"); trace!("request_missing_blocks from timer"); - // Drives syncing from timeouts, not just new Proposals + // TODO: Drive passive-sync from Timeouts if self.sync.am_syncing()? { - // TODO: Sync from Timeouts self.sync.sync_internal()?; } else { trace!("not syncing ..."); diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 690ad67c6..5f298b021 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -242,16 +242,7 @@ pub struct InjectedProposal { pub block: Proposal, } -/// Used to hold metadata about the chain -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChainMetaData { - pub parent_hash: Hash, - pub block_hash: Hash, - pub block_number: u64, - pub view_number: u64, -} - -/// Used to convey proposal processing internally, to avoid blocking threads for too long. +/// TODO: Remove. Unused in RFC161 algorithm #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProcessProposal { // An encoded PeerId @@ -287,7 +278,7 @@ pub enum ExternalMessage { RemovePeer, InjectedProposal(InjectedProposal), MetaDataRequest(BlockRequestV2), - MetaDataResponse(Vec), + MetaDataResponse(Vec), MultiBlockRequest(Vec), MultiBlockResponse(Vec), } diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 7a70788ca..e0d463cd8 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -20,8 +20,8 @@ use crate::{ crypto::Hash, db::Db, message::{ - Block, BlockRequest, BlockRequestV2, BlockResponse, ChainMetaData, ExternalMessage, - InjectedProposal, Proposal, + Block, BlockHeader, BlockRequest, BlockRequestV2, BlockResponse, ExternalMessage, + InjectedProposal, Proposal, QuorumCertificate, }, node::MessageSender, time::SystemTime, @@ -211,18 +211,13 @@ impl Sync { } /// Peeks into the top of the segment stack. - fn last_segment(&self) -> Result> { + fn last_segment(&self) -> Result> { let mut result = None; self.db.with_sqlite_tx(|c| { result = c .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, peer, version FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? .query_row([], |row| Ok(( - ChainMetaData{ - parent_hash: row.get(0)?, - block_hash: row.get(1)?, - block_number: row.get(2)?, - view_number: row.get(3)?, - }, + BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?), PeerInfo { last_used: Instant::now(), score:u32::MAX, @@ -274,16 +269,16 @@ impl Sync { } /// Pushes a particular segment into the stack. - fn push_segment(&self, peer: PeerInfo, meta: ChainMetaData) -> Result<()> { + fn push_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> { self.db.with_sqlite_tx(|c| { c.prepare_cached( "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, peer, version) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :peer, :version)")? .execute( named_params! 
{ - ":parent_hash": meta.parent_hash, - ":block_hash": meta.block_hash, - ":block_number": meta.block_number, - ":view_number": meta.view_number, + ":parent_hash": meta.qc.block_hash, + ":block_hash": meta.hash, + ":block_number": meta.number, + ":view_number": meta.view, ":peer": peer.peer_id.to_bytes(), ":version": peer.version, }, @@ -293,18 +288,18 @@ impl Sync { } /// Bulk inserts a bunch of metadata. - fn insert_metadata(&self, metas: &Vec) -> Result<()> { + fn insert_metadata(&self, metas: &Vec) -> Result<()> { self.db.with_sqlite_tx(|c| { for meta in metas { c.prepare_cached( "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)")? .execute( named_params! { - ":parent_hash": meta.parent_hash, - ":block_hash": meta.block_hash, - ":block_number": meta.block_number, - ":view_number": meta.view_number, - }, + ":parent_hash": meta.qc.block_hash, + ":block_hash": meta.hash, + ":block_number": meta.number, + ":view_number": meta.view, + }, )?; } Ok(()) @@ -321,7 +316,7 @@ impl Sync { /// Phase 0: Sync a block proposal. /// - /// This is the main entry point for syncing a block proposal. + /// This is the main entry point for active-syncing a block proposal. /// We start by enqueuing all proposals, and then check if the parent block exists in history. /// If the parent block exists, we do nothing. Otherwise, we check the least recent one. /// If we find its parent in history, we inject the entire queue. Otherwise, we start syncing. @@ -354,12 +349,12 @@ impl Sync { let block_hash = self.recent_proposals.back().unwrap().hash(); let block_number = self.recent_proposals.back().unwrap().number(); let view_number = self.recent_proposals.back().unwrap().view(); - let meta = ChainMetaData { - block_hash, + let meta = BlockHeader::from_meta_data( parent_hash, + block_hash, block_number, view_number, - }; + ); self.request_missing_metadata(Some(meta))?; let highest_block = self @@ -439,16 +434,6 @@ impl Sync { Proposal::from_parts(block, txs) } - /// Convenience function to extract metadata from the block. - fn block_to_metadata(&self, block: Block) -> ChainMetaData { - ChainMetaData { - parent_hash: block.parent_hash(), - block_hash: block.hash(), - block_number: block.number(), - view_number: block.view(), - } - } - /// Phase 2: Retry Phase 1 /// /// If something went wrong in Phase 2, Phase 1 may need to be retried for the recently used segment. @@ -623,7 +608,7 @@ impl Sync { // If we have no chain_segments, we have nothing to do if let Some((meta, peer_info)) = self.last_segment()? { - let request_hashes = self.get_segment(meta.parent_hash)?; + let request_hashes = self.get_segment(meta.qc.block_hash)?; // Checksum of the request hashes let checksum = request_hashes @@ -660,8 +645,8 @@ impl Sync { }); // do not add VIEW_DRIFT - the stored marker is accurate! ExternalMessage::BlockRequest(BlockRequest { - to_view: meta.view_number.saturating_sub(1), - from_view: meta.view_number.saturating_sub(self.max_batch_size as u64), + to_view: meta.view.saturating_sub(1), + from_view: meta.view.saturating_sub(self.max_batch_size as u64), }) } }; @@ -722,13 +707,18 @@ impl Sync { // Convert the V1 response into a V2 response. match self.state { // Phase 1 - construct the metadata chain from the set of received proposals - SyncState::Phase1(ChainMetaData { - block_number, - mut parent_hash, + SyncState::Phase1(BlockHeader { + number: block_number, + qc: + QuorumCertificate { + block_hash: parent_hash, + .. 
+ }, .. }) => { // We do not buffer the proposals, as it takes 250MB/day! // Instead, we will re-request the proposals again, in Phase 2. + let mut parent_hash = parent_hash; let metadata = response .proposals .into_iter() @@ -743,12 +733,7 @@ impl Sync { parent_hash = p.header.qc.block_hash; true }) - .map(|p| ChainMetaData { - block_hash: p.hash(), - parent_hash: p.header.qc.block_hash, - block_number: p.number(), - view_number: p.view(), - }) + .map(|p| p.header) .collect_vec(); self.handle_metadata_response(from, metadata)?; @@ -784,7 +769,7 @@ impl Sync { pub fn handle_metadata_response( &mut self, from: PeerId, - response: Vec, + response: Vec, ) -> Result<()> { // Check for expected response let segment_peer = if let Some(peer) = self.in_flight.as_ref() { @@ -817,27 +802,24 @@ impl Sync { anyhow::bail!("sync::MetadataResponse : invalid state"); }; - let mut block_hash = meta.parent_hash; - let mut block_num = meta.block_number; + let mut block_hash = meta.qc.block_hash; + let mut block_num = meta.number; for meta in response.iter() { // check that the block hash and number is as expected. - if meta.block_hash != Hash::ZERO - && block_hash == meta.block_hash - && block_num == meta.block_number + 1 - { - block_hash = meta.parent_hash; - block_num = meta.block_number; + if meta.hash != Hash::ZERO && block_hash == meta.hash && block_num == meta.number + 1 { + block_hash = meta.qc.block_hash; + block_num = meta.number; } else { // TODO: possibly, discard and rebuild entire chain // if something does not match, do nothing and retry the request with the next peer. tracing::error!( "sync::MetadataResponse : unexpected metadata hash={block_hash} != {}, num={block_num} != {}", - meta.block_hash, - meta.block_number, + meta.hash, + meta.number, ); return Ok(()); } - if meta.block_hash == response.last().unwrap().block_hash { + if meta.hash == response.last().unwrap().hash { break; // done, we do not check the last parent, because that's outside this segment } } @@ -846,11 +828,11 @@ impl Sync { let segment = response; // Record landmark, including peer that has this set of blocks - self.push_segment(segment_peer, meta.clone())?; + self.push_segment(segment_peer, *meta)?; // Record the oldest block in the chain's parent self.state = SyncState::Phase1(segment.last().cloned().unwrap()); - let last_block_hash = segment.last().as_ref().unwrap().block_hash; + let last_block_hash = segment.last().as_ref().unwrap().hash; tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", @@ -864,7 +846,7 @@ impl Sync { // If the checkpoint is in this segment, let checkpointed = if let Some(checkpoint) = self.checkpoint_hash { - segment.iter().any(|b| b.block_hash == checkpoint) + segment.iter().any(|b| b.hash == checkpoint) } else { false }; @@ -912,7 +894,7 @@ impl Sync { break; // that's all we have! }; hash = block.parent_hash(); - metas.push(self.block_to_metadata(block)); + metas.push(block.header); } let message = ExternalMessage::MetaDataResponse(metas); @@ -928,7 +910,7 @@ impl Sync { /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If Phase 1 is in progress, it continues requesting blocks from the last known Phase 1 block. /// Otherwise, it requests blocks from the given starting metadata. 
- pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { + pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { if !matches!(self.state, SyncState::Phase1(_)) && !matches!(self.state, SyncState::Phase0) { anyhow::bail!("sync::RequestMissingMetadata : invalid state"); } @@ -960,14 +942,27 @@ impl Sync { peer.peer_id ); let message = match (self.state.clone(), &peer.version) { - (SyncState::Phase1(ChainMetaData { parent_hash, .. }), PeerVer::V2) => { - ExternalMessage::MetaDataRequest(BlockRequestV2 { - request_at: SystemTime::now(), - from_hash: parent_hash, - batch_size: self.max_batch_size, - }) - } - (SyncState::Phase1(ChainMetaData { view_number, .. }), PeerVer::V1) => { + ( + SyncState::Phase1(BlockHeader { + qc: + QuorumCertificate { + block_hash: parent_hash, + .. + }, + .. + }), + PeerVer::V2, + ) => ExternalMessage::MetaDataRequest(BlockRequestV2 { + request_at: SystemTime::now(), + from_hash: parent_hash, + batch_size: self.max_batch_size, + }), + ( + SyncState::Phase1(BlockHeader { + view: view_number, .. + }), + PeerVer::V1, + ) => { // For V1 BlockRequest, we request a little more than we need, due to drift // Since the view number is an 'internal' clock, it is possible for the same block number // to have different view numbers. @@ -979,7 +974,7 @@ impl Sync { } (SyncState::Phase0, PeerVer::V2) if meta.is_some() => { let meta = meta.unwrap(); - let parent_hash = meta.parent_hash; + let parent_hash = meta.qc.block_hash; self.state = SyncState::Phase1(meta); ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), @@ -989,7 +984,7 @@ impl Sync { } (SyncState::Phase0, PeerVer::V1) if meta.is_some() => { let meta = meta.unwrap(); - let view_number = meta.view_number; + let view_number = meta.view; self.state = SyncState::Phase1(meta); let drift = self.max_batch_size as u64 / 10; ExternalMessage::BlockRequest(BlockRequest { @@ -1104,7 +1099,7 @@ impl Sync { // if the new peer is not synced, it will get downgraded to the back of heap. // but by placing them at the back of the 'best' pack, we get to try them out soon. 
let new_peer = PeerInfo { - version: PeerVer::V1, // default V2 + version: PeerVer::V1, score: self.peers.iter().map(|p| p.score).min().unwrap_or_default(), peer_id: peer, last_used: Instant::now(), @@ -1237,10 +1232,11 @@ impl PartialOrd for DownGrade { } /// Sync state +#[allow(clippy::large_enum_variant)] #[derive(Debug, Clone)] enum SyncState { Phase0, - Phase1(ChainMetaData), + Phase1(BlockHeader), Phase2(Hash), Phase3, Retry1, @@ -1268,3 +1264,21 @@ impl ToSql for PeerVer { Ok((self.clone() as u32).into()) } } + +impl BlockHeader { + pub fn from_meta_data( + parent_hash: Hash, + block_hash: Hash, + block_number: u64, + view_number: u64, + ) -> BlockHeader { + let mut meta = BlockHeader { + view: view_number, + number: block_number, + hash: block_hash, + ..Default::default() + }; + meta.qc.block_hash = parent_hash; + meta + } +} diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index cf07bcecb..01332edd8 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -1058,10 +1058,12 @@ impl Network { } async fn run_until_synced(&mut self, index: usize) { - let mut check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); - while index == check { - check = self.rng.lock().unwrap().gen_range(0..self.nodes.len()); - } + let check = loop { + let i = self.random_index(); + if i != index { + break i; + } + }; let mut debounce = 0; let mut old_height = 0; self.run_until( From e0007e30d36f6a57caa1270f61bf4bd181ba1e4e Mon Sep 17 00:00:00 2001 From: Shawn Date: Thu, 23 Jan 2025 21:50:27 +0800 Subject: [PATCH 090/119] feat: changed BlockRequestV2 from 'hash'-based to 'height'-based. --- zilliqa/src/message.rs | 10 +++++++--- zilliqa/src/p2p_node.rs | 13 ++++++++++++- zilliqa/src/sync.rs | 30 +++++++++++++++++------------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 5f298b021..fca919a3e 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -230,8 +230,8 @@ impl fmt::Debug for BlockResponse { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BlockRequestV2 { pub request_at: SystemTime, - pub from_hash: Hash, - pub batch_size: usize, + pub from_height: u64, + pub to_height: u64, } /// Used to convey proposal processing internally, to avoid blocking threads for too long. @@ -306,7 +306,11 @@ impl Display for ExternalMessage { write!(f, "MetaDataResponse({})", r.len()) } ExternalMessage::MetaDataRequest(r) => { - write!(f, "MetaDataRequest({}, num={})", r.from_hash, r.batch_size) + write!( + f, + "MetaDataRequest(from={}, to={})", + r.from_height, r.to_height + ) } ExternalMessage::InjectedProposal(p) => { write!(f, "InjectedProposal {}", p.block.number()) diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index 6fd941b23..9cec57729 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -34,9 +34,10 @@ use crate::{ cfg::{Config, ConsensusConfig, NodeConfig}, crypto::SecretKey, db, - message::{ExternalMessage, InternalMessage}, + message::{BlockRequestV2, ExternalMessage, InternalMessage}, node::{OutgoingMessageFailure, RequestId}, node_launcher::{NodeInputChannels, NodeLauncher, ResponseChannel}, + time::SystemTime, }; /// Messages are a tuple of the destination shard ID and the actual message. 
@@ -312,6 +313,16 @@ impl P2pNode { debug!(source = %_source, %to, external_message = %_external_message, request_id = %_request_id, "message received"); let _topic = Self::shard_id_to_topic(shard_id); let _id = format!("{}", _request_id); + + // insert local time for BlockRequestV2 - this is checked in Sync::HandleMetadataRequest + let _external_message = match _external_message { + ExternalMessage::MetaDataRequest(BlockRequestV2{from_height, to_height, ..}) => ExternalMessage::MetaDataRequest(BlockRequestV2{ + from_height, to_height, request_at: SystemTime::now(), + }), + // pass-thru everything else + e => e, + }; + cfg_if! { if #[cfg(not(feature = "fake_response_channel"))] { self.send_to(&_topic.hash(), |c| c.requests.send((_source, _id, _external_message, ResponseChannel::Remote(_channel))))?; diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index e0d463cd8..f40fd21d9 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -1,6 +1,7 @@ use std::{ cmp::Ordering, collections::{BinaryHeap, VecDeque}, + ops::Sub, sync::Arc, time::{Duration, Instant}, }; @@ -885,10 +886,17 @@ impl Sync { // TODO: Check if we should service this request // Validators could respond to this request if there is nothing else to do. - let batch_size: usize = self.max_batch_size.min(request.batch_size); // mitigate DOS by limiting the number of blocks we return + let batch_size: usize = self + .max_batch_size + .min(request.to_height.saturating_sub(request.from_height) as usize); // mitigate DOS by limiting the number of blocks we return let mut metas = Vec::with_capacity(batch_size); - let mut hash = request.from_hash; - while metas.len() < batch_size { + let Some(block) = self.db.get_canonical_block_by_number(request.to_height)? else { + tracing::warn!("sync::MetadataRequest : unknown block height"); + return Ok(ExternalMessage::Acknowledgement); + }; + metas.push(block.header); + let mut hash = block.parent_hash(); + while metas.len() <= batch_size { // grab the parent let Some(block) = self.db.get_block_by_hash(&hash)? else { break; // that's all we have! @@ -944,18 +952,14 @@ impl Sync { let message = match (self.state.clone(), &peer.version) { ( SyncState::Phase1(BlockHeader { - qc: - QuorumCertificate { - block_hash: parent_hash, - .. - }, + number: block_number, .. }), PeerVer::V2, ) => ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), - from_hash: parent_hash, - batch_size: self.max_batch_size, + to_height: block_number.saturating_sub(1), + from_height: block_number.saturating_sub(self.max_batch_size as u64), }), ( SyncState::Phase1(BlockHeader { @@ -974,12 +978,12 @@ impl Sync { } (SyncState::Phase0, PeerVer::V2) if meta.is_some() => { let meta = meta.unwrap(); - let parent_hash = meta.qc.block_hash; + let block_number = meta.number; self.state = SyncState::Phase1(meta); ExternalMessage::MetaDataRequest(BlockRequestV2 { request_at: SystemTime::now(), - from_hash: parent_hash, - batch_size: self.max_batch_size, + to_height: block_number.sub(1), + from_height: block_number.sub(self.max_batch_size as u64), }) } (SyncState::Phase0, PeerVer::V1) if meta.is_some() => { From 7c4d40bba2dfe0b25d44a81e5a3384898bd02ee1 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 08:09:41 +0800 Subject: [PATCH 091/119] feat: simplify checkpointed check. 
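
Stores the checkpoint as a plain Hash that defaults to Hash::ZERO instead of an
Option, so the per-segment check no longer needs a nested "if let". A minimal
before/after sketch, taken from the sync.rs hunk below:

    // before: the Option forced a two-step check
    // let checkpointed = if let Some(checkpoint) = self.checkpoint_hash {
    //     segment.iter().any(|b| b.hash == checkpoint)
    // } else {
    //     false
    // };

    // after: Hash::ZERO never matches a real block hash, so a single scan suffices
    let checkpointed = segment.iter().any(|b| b.hash == self.checkpoint_hash);
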
--- zilliqa/src/consensus.rs | 4 ++-- zilliqa/src/sync.rs | 16 ++++++---------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 294ca6b93..3f01a52a9 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -205,8 +205,6 @@ impl Consensus { )?; } - let sync = Sync::new(&config, db.clone(), message_sender.clone())?; - let latest_block = db .get_finalized_view()? .and_then(|view| { @@ -314,6 +312,8 @@ impl Consensus { } }; + let sync = Sync::new(&config, db.clone(), message_sender.clone())?; + let mut consensus = Consensus { secret_key, config, diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index f40fd21d9..9f8b1a93e 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -88,8 +88,8 @@ pub struct Sync { inject_at: Option<(std::time::Instant, usize)>, // record starting number, for eth_syncing() RPC call. started_at_block_number: u64, - // checkpoint - checkpoint_hash: Option, + // checkpoint, if set + checkpoint_hash: Hash, } impl Sync { @@ -155,14 +155,14 @@ impl Sync { recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, started_at_block_number: 0, - checkpoint_hash: None, + checkpoint_hash: Hash::ZERO, }) } pub fn set_checkpoint(&mut self, checkpoint: &Block) { let hash = checkpoint.hash(); tracing::info!("sync::Checkpoint {}", hash); - self.checkpoint_hash = Some(hash); + self.checkpoint_hash = hash; } /// Returns the number of stored segments @@ -845,12 +845,8 @@ impl Sync { // Record the constructed chain metadata self.insert_metadata(&segment)?; - // If the checkpoint is in this segment, - let checkpointed = if let Some(checkpoint) = self.checkpoint_hash { - segment.iter().any(|b| b.hash == checkpoint) - } else { - false - }; + // If the checkpoint is in this segment + let checkpointed = segment.iter().any(|b| b.hash == self.checkpoint_hash); // If the segment hits our history, start Phase 2. if checkpointed || self.db.contains_block(&last_block_hash)? { From 75ab04b3c2938d445c5b4543503d111ff3526e99 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 08:32:56 +0800 Subject: [PATCH 092/119] nit: make sync_data temporary. --- zilliqa/src/sync.rs | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 9f8b1a93e..d738d13f0 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -107,11 +107,10 @@ impl Sync { let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. - // This DB could be left in-here as it is only used in this module - // TODO: Make this in-memory by exploiting SQLite TEMP tables i.e. CREATE TEMP TABLE + // This in-memory DB is placed in-here as it is only used in this module. db.with_sqlite_tx(|c| { c.execute_batch( - "CREATE TABLE IF NOT EXISTS sync_data ( + "CREATE TEMP TABLE IF NOT EXISTS sync_data ( block_hash BLOB NOT NULL UNIQUE, parent_hash BLOB NOT NULL, block_number INTEGER NOT NULL PRIMARY KEY, @@ -140,6 +139,18 @@ impl Sync { SyncState::Retry1 }; + let latest_block_number = db + .get_finalized_view()? 
+ .and_then(|view| { + db.get_block_hash_by_view(view) + .expect("no header found at view {view}") + }) + .and_then(|hash| { + db.get_block_by_hash(&hash) + .expect("no block found for hash {hash}") + }) + .and_then(|block| Some(block.number())); + Ok(Self { db, message_sender, @@ -154,17 +165,11 @@ impl Sync { state, recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, - started_at_block_number: 0, + started_at_block_number: latest_block_number.unwrap_or_default(), checkpoint_hash: Hash::ZERO, }) } - pub fn set_checkpoint(&mut self, checkpoint: &Block) { - let hash = checkpoint.hash(); - tracing::info!("sync::Checkpoint {}", hash); - self.checkpoint_hash = hash; - } - /// Returns the number of stored segments fn count_segments(&self) -> Result { let mut segments = 0; @@ -847,9 +852,10 @@ impl Sync { // If the checkpoint is in this segment let checkpointed = segment.iter().any(|b| b.hash == self.checkpoint_hash); - + let started = self.started_at_block_number <= segment.first().as_ref().unwrap().number + && self.started_at_block_number >= segment.last().as_ref().unwrap().number; // If the segment hits our history, start Phase 2. - if checkpointed || self.db.contains_block(&last_block_hash)? { + if started || checkpointed || self.db.contains_block(&last_block_hash)? { self.state = SyncState::Phase2(Hash::ZERO); } else if Self::DO_SPECULATIVE { self.request_missing_metadata(None)?; @@ -1185,6 +1191,13 @@ impl Sync { ))) } } + + /// Sets the checkpoint, if node was started from a checkpoint. + pub fn set_checkpoint(&mut self, checkpoint: &Block) { + let hash = checkpoint.hash(); + tracing::info!("sync::Checkpoint {}", hash); + self.checkpoint_hash = hash; + } } #[derive(Debug, Clone, Eq, PartialEq)] From 1c8c4124fb78843311dd5e2c120e9b7539183315 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 08:48:08 +0800 Subject: [PATCH 093/119] fix: better error handling for committee_for_hash(). --- zilliqa/src/consensus.rs | 10 +++++++--- zilliqa/src/sync.rs | 2 +- zilliqa/tests/it/main.rs | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 3f01a52a9..18cb10c2e 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -829,7 +829,6 @@ impl Consensus { let proposer_address = parent_state.get_reward_address(proposer)?; - let mut total_cosigner_stake = 0; let cosigner_stake: Vec<_> = committee .iter() .enumerate() @@ -841,11 +840,15 @@ impl Consensus { .unwrap() .unwrap() .get(); - total_cosigner_stake += stake; (reward_address, stake) }) .collect(); + let total_cosigner_stake = cosigner_stake.iter().fold(0, |sum, c| sum + c.1); + if total_cosigner_stake == 0 { + return Err(anyhow!("total stake is 0")); + } + // Track total awards given out. 
This may be different to rewards_per_block because we round down on division when we split the rewards let mut total_rewards_issued = 0; @@ -1698,7 +1701,8 @@ impl Consensus { fn committee_for_hash(&self, parent_hash: Hash) -> Result> { let Ok(Some(parent)) = self.get_block(&parent_hash) else { - return Err(anyhow!("parent block not found: {:?}", parent_hash)); + tracing::error!("parent block not found: {:?}", parent_hash); + return Ok(Vec::new()); // return an empty vector instead of Err for graceful app-level error-handling }; let parent_root_hash = parent.state_root_hash(); diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index d738d13f0..2f5956f08 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -149,7 +149,7 @@ impl Sync { db.get_block_by_hash(&hash) .expect("no block found for hash {hash}") }) - .and_then(|block| Some(block.number())); + .map(|block| block.number()); Ok(Self { db, diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index 01332edd8..e1267087f 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -995,7 +995,7 @@ impl Network { external_message.clone(), response_channel, ) - .ok(); // TODO: better error handling + .unwrap(); } }); } From 6036668a5cc666d3311ef2b59a78d8f76b209869 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 09:05:17 +0800 Subject: [PATCH 094/119] feat: sets starting_block during Sync::new(). --- zilliqa/src/consensus.rs | 2 +- zilliqa/src/sync.rs | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 18cb10c2e..77ff2ba04 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -312,7 +312,7 @@ impl Consensus { } }; - let sync = Sync::new(&config, db.clone(), message_sender.clone())?; + let sync = Sync::new(&config, db.clone(), &latest_block, message_sender.clone())?; let mut consensus = Consensus { secret_key, diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 2f5956f08..db01d1e71 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -102,7 +102,12 @@ impl Sync { // Minimum of 2 peers to avoid single source of truth. const MIN_PEERS: usize = 2; - pub fn new(config: &NodeConfig, db: Arc, message_sender: MessageSender) -> Result { + pub fn new( + config: &NodeConfig, + db: Arc, + latest_block: &Option, + message_sender: MessageSender, + ) -> Result { let peer_id = message_sender.our_peer_id; let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. @@ -139,17 +144,11 @@ impl Sync { SyncState::Retry1 }; - let latest_block_number = db - .get_finalized_view()? 
- .and_then(|view| { - db.get_block_hash_by_view(view) - .expect("no header found at view {view}") - }) - .and_then(|hash| { - db.get_block_by_hash(&hash) - .expect("no block found for hash {hash}") - }) - .map(|block| block.number()); + let latest_block_number = latest_block + .as_ref() + .expect("Some(block) expected") + .number(); + tracing::info!("latest_block_number = {latest_block_number}"); Ok(Self { db, @@ -165,7 +164,7 @@ impl Sync { state, recent_proposals: VecDeque::with_capacity(max_batch_size), inject_at: None, - started_at_block_number: latest_block_number.unwrap_or_default(), + started_at_block_number: latest_block_number, checkpoint_hash: Hash::ZERO, }) } From deec0fa15cffba2c22746fbbcb83f3755269c53a Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 09:31:12 +0800 Subject: [PATCH 095/119] feat: store gas_used as a proxy for block size. --- zilliqa/src/sync.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index db01d1e71..4d4845ede 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -26,6 +26,7 @@ use crate::{ }, node::MessageSender, time::SystemTime, + transaction::EvmGas, }; // Syncing Algorithm @@ -112,7 +113,7 @@ impl Sync { let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. - // This in-memory DB is placed in-here as it is only used in this module. + // This in-memory DB is placed here as it is only used in this module. db.with_sqlite_tx(|c| { c.execute_batch( "CREATE TEMP TABLE IF NOT EXISTS sync_data ( @@ -120,8 +121,9 @@ impl Sync { parent_hash BLOB NOT NULL, block_number INTEGER NOT NULL PRIMARY KEY, view_number INTEGER NOT NULL, - peer BLOB DEFAULT NULL, - version INTEGER DEFAULT 0 + gas_used INTEGER NOT NULL, + version INTEGER DEFAULT 0, + peer BLOB DEFAULT NULL ); CREATE INDEX IF NOT EXISTS idx_sync_data ON sync_data(block_number) WHERE peer IS NOT NULL;", )?; @@ -148,7 +150,6 @@ impl Sync { .as_ref() .expect("Some(block) expected") .number(); - tracing::info!("latest_block_number = {latest_block_number}"); Ok(Self { db, @@ -220,14 +221,14 @@ impl Sync { let mut result = None; self.db.with_sqlite_tx(|c| { result = c - .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, peer, version FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, gas_used, version, peer FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? .query_row([], |row| Ok(( - BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?), + BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?), PeerInfo { last_used: Instant::now(), score:u32::MAX, version: row.get(5)?, - peer_id: PeerId::from_bytes(row.get::<_,Vec>(4)?.as_slice()).unwrap(), + peer_id: PeerId::from_bytes(row.get::<_,Vec>(6)?.as_slice()).unwrap(), }, ))) .optional()?; @@ -277,13 +278,14 @@ impl Sync { fn push_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> { self.db.with_sqlite_tx(|c| { c.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, peer, version) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :peer, :version)")? 
+ "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used, version, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used, :version, :peer)")? .execute( named_params! { ":parent_hash": meta.qc.block_hash, ":block_hash": meta.hash, ":block_number": meta.number, ":view_number": meta.view, + ":gas_used": meta.gas_used, ":peer": peer.peer_id.to_bytes(), ":version": peer.version, }, @@ -297,13 +299,14 @@ impl Sync { self.db.with_sqlite_tx(|c| { for meta in metas { c.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number) VALUES (:parent_hash, :block_hash, :block_number, :view_number)")? + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used)")? .execute( named_params! { ":parent_hash": meta.qc.block_hash, ":block_hash": meta.hash, ":block_number": meta.number, ":view_number": meta.view, + ":gas_used": meta.gas_used, }, )?; } @@ -354,11 +357,13 @@ impl Sync { let block_hash = self.recent_proposals.back().unwrap().hash(); let block_number = self.recent_proposals.back().unwrap().number(); let view_number = self.recent_proposals.back().unwrap().view(); + let gas_used = self.recent_proposals.back().unwrap().header.gas_used; let meta = BlockHeader::from_meta_data( parent_hash, block_hash, block_number, view_number, + gas_used, ); self.request_missing_metadata(Some(meta))?; @@ -1283,8 +1288,10 @@ impl BlockHeader { block_hash: Hash, block_number: u64, view_number: u64, + gas_used: EvmGas, ) -> BlockHeader { let mut meta = BlockHeader { + gas_used, view: view_number, number: block_number, hash: block_hash, From 4d68b82aade69f69fd840032cbe81b3cc37e44ca Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 09:42:20 +0800 Subject: [PATCH 096/119] feat: reordered handle_metadata_response() to allow for micro-segmentation. --- zilliqa/src/sync.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 4d4845ede..b3d5163e3 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -837,13 +837,6 @@ impl Sync { // Chain segment is sane let segment = response; - // Record landmark, including peer that has this set of blocks - self.push_segment(segment_peer, *meta)?; - - // Record the oldest block in the chain's parent - self.state = SyncState::Phase1(segment.last().cloned().unwrap()); - let last_block_hash = segment.last().as_ref().unwrap().hash; - tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", segment.len(), @@ -854,6 +847,13 @@ impl Sync { // Record the constructed chain metadata self.insert_metadata(&segment)?; + // Record landmark(s), including peer that has this set of blocks + self.push_segment(segment_peer, *meta)?; + + // Record the oldest block in the chain's parent + self.state = SyncState::Phase1(segment.last().cloned().unwrap()); + let last_block_hash = segment.last().as_ref().unwrap().hash; + // If the checkpoint is in this segment let checkpointed = segment.iter().any(|b| b.hash == self.checkpoint_hash); let started = self.started_at_block_number <= segment.first().as_ref().unwrap().number From a00851a801f9c818f49862e5fc84438c7d34325a Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 14:08:03 +0800 Subject: [PATCH 097/119] fix: removed dynamic_batch_sizing() as it should be unnecessary until block 1.0M in protomainnet. 
--- zilliqa/src/lib.rs | 2 +- zilliqa/src/sync.rs | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/zilliqa/src/lib.rs b/zilliqa/src/lib.rs index 9b783a862..8da13f370 100644 --- a/zilliqa/src/lib.rs +++ b/zilliqa/src/lib.rs @@ -22,8 +22,8 @@ pub mod scilla; mod scilla_proto; pub mod serde_util; pub mod state; -pub mod test_util; pub mod sync; +pub mod test_util; pub mod time; pub mod transaction; pub mod zq1_proto; diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index b3d5163e3..364b28582 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -1131,8 +1131,9 @@ impl Sync { fn get_next_peer(&mut self) -> Option { if self.peers.len() >= Self::MIN_PEERS { let mut peer = self.peers.pop()?; - peer.last_used = std::time::Instant::now(); // used to determine stale requests. - self.max_batch_size = self.dynamic_batch_sizing(&peer); + peer.last_used = std::time::Instant::now(); + // dynamic sizing should not be needed, if we're syncing recent blocks. + // self.max_batch_size = self.dynamic_batch_sizing(&peer); tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); return Some(peer); } @@ -1144,7 +1145,7 @@ impl Sync { /// /// Due to a hard-coded 10MB response limit in libp2p, we may be limited in how many blocks we can request /// for in a single request, between 1-100 blocks. - fn dynamic_batch_sizing(&self, peer: &PeerInfo) -> usize { + fn _dynamic_batch_sizing(&self, peer: &PeerInfo) -> usize { match (&self.state, &peer.version, &self.in_flight_reason) { // V1 response may be too large, reduce request range. (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => self From 4ed1f3865597f09391e94cecf15ecbb6f8bde578 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 16:05:07 +0800 Subject: [PATCH 098/119] feat: shifts txn verification from server side to client side. --- zilliqa/src/consensus.rs | 2 +- zilliqa/src/p2p_node.rs | 13 +------------ zilliqa/src/sync.rs | 33 ++++++++++++++++++++++++++------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 77ff2ba04..08f1b3a98 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -1701,7 +1701,7 @@ impl Consensus { fn committee_for_hash(&self, parent_hash: Hash) -> Result> { let Ok(Some(parent)) = self.get_block(&parent_hash) else { - tracing::error!("parent block not found: {:?}", parent_hash); + // tracing::error!("parent block not found: {:?}", parent_hash); return Ok(Vec::new()); // return an empty vector instead of Err for graceful app-level error-handling }; diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index 9cec57729..6fd941b23 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -34,10 +34,9 @@ use crate::{ cfg::{Config, ConsensusConfig, NodeConfig}, crypto::SecretKey, db, - message::{BlockRequestV2, ExternalMessage, InternalMessage}, + message::{ExternalMessage, InternalMessage}, node::{OutgoingMessageFailure, RequestId}, node_launcher::{NodeInputChannels, NodeLauncher, ResponseChannel}, - time::SystemTime, }; /// Messages are a tuple of the destination shard ID and the actual message. 
@@ -313,16 +312,6 @@ impl P2pNode { debug!(source = %_source, %to, external_message = %_external_message, request_id = %_request_id, "message received"); let _topic = Self::shard_id_to_topic(shard_id); let _id = format!("{}", _request_id); - - // insert local time for BlockRequestV2 - this is checked in Sync::HandleMetadataRequest - let _external_message = match _external_message { - ExternalMessage::MetaDataRequest(BlockRequestV2{from_height, to_height, ..}) => ExternalMessage::MetaDataRequest(BlockRequestV2{ - from_height, to_height, request_at: SystemTime::now(), - }), - // pass-thru everything else - e => e, - }; - cfg_if! { if #[cfg(not(feature = "fake_response_channel"))] { self.send_to(&_topic.hash(), |c| c.requests.send((_source, _id, _external_message, ResponseChannel::Remote(_channel))))?; diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 364b28582..e8bdfb1f4 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -430,18 +430,20 @@ impl Sync { } /// Convenience function to convert a block to a proposal (add full txs) - /// NOTE: Includes intershard transactions. Should only be used for syncing history, - /// not for consensus messages regarding new blocks. + /// Should only be used for syncing history, not for consensus messages regarding new blocks. fn block_to_proposal(&self, block: Block) -> Proposal { // since block must be valid, unwrap(s) are safe let txs = block .transactions .iter() .map(|hash| self.db.get_transaction(hash).unwrap().unwrap()) - .map(|tx| tx.verify().unwrap()) + // handle verification on the client-side + .map(|tx| { + let hash = tx.calculate_hash(); + (tx, hash) + }) .collect_vec(); - - Proposal::from_parts(block, txs) + Proposal::from_parts_with_hashes(block, txs) } /// Phase 2: Retry Phase 1 @@ -480,6 +482,23 @@ impl Sync { &mut self, from: PeerId, response: Vec, + ) -> Result<()> { + // Verify transactions on the client-side + let proposals = response + .into_iter() + .map(|p| { + let (b, t) = p.into_parts(); + let txns = t.into_iter().map(|t| t.verify().unwrap()).collect_vec(); + Proposal::from_parts(b, txns) + }) + .collect_vec(); + self.inner_handle_multiblock_response(from, proposals) + } + + pub fn inner_handle_multiblock_response( + &mut self, + from: PeerId, + response: Vec, ) -> Result<()> { if let Some(peer) = self.in_flight.as_ref() { if peer.peer_id != from { @@ -759,7 +778,7 @@ impl Sync { .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); - self.handle_multiblock_response(from, multi_blocks)?; + self.inner_handle_multiblock_response(from, multi_blocks)?; } _ => { tracing::error!( @@ -883,7 +902,7 @@ impl Sync { from ); - // Do not respond to stale requests + // Do not respond to stale requests as the client has timed-out if request.request_at.elapsed()? 
> self.request_timeout { tracing::warn!("sync::MetadataRequest : stale request"); return Ok(ExternalMessage::Acknowledgement); From 3da126ff6cf8ddf44e359f5ed4ba6c3f9ee58796 Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 24 Jan 2025 17:51:17 +0800 Subject: [PATCH 099/119] fix: validators_can_join..() --- zilliqa/tests/it/staking.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index 65dd1a7e8..179b46d84 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -430,10 +430,10 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let wallet = network.genesis_wallet().await; // randomise the current epoch state and current leader - // let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); - // network - // .run_until_block(&wallet, blocks_to_prerun.into(), 100) - // .await; + let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); + network + .run_until_block(&wallet, blocks_to_prerun.into(), 200) + .await; // First test joining deposit_v2 let index = network.add_node(); @@ -447,7 +447,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); - network.run_until_synced(index).await; + // This has to be done before `contract_upgrade_block_heights` which is 12, by default in the tests let deposit_hash = deposit_stake( &mut network, &wallet, @@ -532,7 +532,6 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { ); // Give new node time to catch up to block including deposit_v3 deployment - network.run_until_synced(index).await; network .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) .await; From 4449e64c07b2b29426dbea87c75ec56202f7c8e4 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 27 Jan 2025 10:08:58 +0800 Subject: [PATCH 100/119] fix PR comments: - https://github.com/Zilliqa/zq2/pull/2089/files#r1926920199 - https://github.com/Zilliqa/zq2/pull/2089/files#r1926894987 - https://github.com/Zilliqa/zq2/pull/2089/files#r1927243328 - https://github.com/Zilliqa/zq2/pull/2089/files#r1927261156 --- zilliqa/src/consensus.rs | 111 ++++++++++++---------------------- zilliqa/src/message.rs | 4 +- zilliqa/src/sync.rs | 29 ++++----- zilliqa/tests/it/consensus.rs | 1 - 4 files changed, 51 insertions(+), 94 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 08f1b3a98..f7408beb5 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -659,10 +659,8 @@ impl Consensus { return Ok(None); } - if let Err((e, temporary)) = self.check_block(&block, during_sync) { - if !temporary { - warn!(?e, "invalid block proposal received!"); - } + if let Err(e) = self.check_block(&block, during_sync) { + warn!(?e, "invalid block proposal received!"); return Ok(None); } @@ -2225,41 +2223,32 @@ impl Consensus { /// Check the validity of a block. Returns `Err(_, true)` if this block could become valid in the future and /// `Err(_, false)` if this block could never be valid. - fn check_block(&self, block: &Block, during_sync: bool) -> Result<(), (anyhow::Error, bool)> { - block.verify_hash().map_err(|e| (e, false))?; + fn check_block(&self, block: &Block, during_sync: bool) -> Result<()> { + block.verify_hash()?; if block.view() == 0 { // We only check a block if we receive it from an external source. 
We obviously already have the genesis // block, so we aren't ever expecting to receive it. - return Err((anyhow!("tried to check genesis block"), false)); + return Err(anyhow!("tried to check genesis block")); } - let Some(parent) = self - .get_block(&block.parent_hash()) - .map_err(|e| (e, false))? - else { + let Some(parent) = self.get_block(&block.parent_hash())? else { warn!( "Missing parent block while trying to check validity of block number {}", block.number() ); - return Err((MissingBlockError::from(block.parent_hash()).into(), true)); + return Err(MissingBlockError::from(block.parent_hash()).into()); }; - let finalized_view = self.get_finalized_view().map_err(|e| (e, false))?; - let Some(finalized_block) = self - .get_block_by_view(finalized_view) - .map_err(|e| (e, false))? - else { - return Err((MissingBlockError::from(finalized_view).into(), false)); + let finalized_view = self.get_finalized_view()?; + let Some(finalized_block) = self.get_block_by_view(finalized_view)? else { + return Err(MissingBlockError::from(finalized_view).into()); }; if block.view() < finalized_block.view() { - return Err(( - anyhow!( - "block is too old: view is {} but we have finalized {}", - block.view(), - finalized_block.view() - ), - false, + return Err(anyhow!( + "block is too old: view is {} but we have finalized {}", + block.view(), + finalized_block.view() )); } @@ -2274,12 +2263,11 @@ impl Consensus { let committee = self .state .at_root(parent.state_root_hash().into()) - .get_stakers(block.header) - .map_err(|e| (e, false))?; + .get_stakers(block.header)?; if verified.is_err() { info!(?block, "Unable to verify block = "); - return Err((anyhow!("invalid block signature found! block hash: {:?} block view: {:?} committee len {:?}", block.hash(), block.view(), committee.len()), false)); + return Err(anyhow!("invalid block signature found! block hash: {:?} block view: {:?} committee len {:?}", block.hash(), block.view(), committee.len())); } // Check if the co-signers of the block's QC represent the supermajority. @@ -2288,13 +2276,11 @@ impl Consensus { &committee, parent.state_root_hash(), block, - ) - .map_err(|e| (e, false))?; + )?; // Verify the block's QC signature - note the parent should be the committee the QC // was signed over. - self.verify_qc_signature(&block.header.qc, committee.clone()) - .map_err(|e| (e, false))?; + self.verify_qc_signature(&block.header.qc, committee.clone())?; if let Some(agg) = &block.agg { // Check if the signers of the block's aggregate QC represent the supermajority self.check_quorum_in_indices( @@ -2302,24 +2288,16 @@ impl Consensus { &committee, parent.state_root_hash(), block, - ) - .map_err(|e| (e, false))?; + )?; // Verify the aggregate QC's signature - self.batch_verify_agg_signature(agg, &committee) - .map_err(|e| (e, false))?; + self.batch_verify_agg_signature(agg, &committee)?; } // Retrieve the highest among the aggregated QCs and check if it equals the block's QC. - let block_high_qc = self.get_high_qc_from_block(block).map_err(|e| (e, false))?; - let Some(block_high_qc_block) = self - .get_block(&block_high_qc.block_hash) - .map_err(|e| (e, false))? - else { + let block_high_qc = self.get_high_qc_from_block(block)?; + let Some(block_high_qc_block) = self.get_block(&block_high_qc.block_hash)? 
else { warn!("missing finalized block4"); - return Err(( - MissingBlockError::from(block_high_qc.block_hash).into(), - false, - )); + return Err(MissingBlockError::from(block_high_qc.block_hash).into()); }; // Prevent the creation of forks from the already committed chain if block_high_qc_block.view() < finalized_block.view() { @@ -2329,19 +2307,16 @@ impl Consensus { finalized_block.view(), self.high_qc, block); - return Err(( - anyhow!( - "invalid block - high QC view is {} while finalized is {}", - block_high_qc_block.view(), - finalized_block.view() - ), - false, + return Err(anyhow!( + "invalid block - high QC view is {} while finalized is {}", + block_high_qc_block.view(), + finalized_block.view() )); } // This block's timestamp must be greater than or equal to the parent block's timestamp. if block.timestamp() < parent.timestamp() { - return Err((anyhow!("timestamp decreased from parent"), false)); + return Err(anyhow!("timestamp decreased from parent")); } // This block's timestamp should be at most `self.allowed_timestamp_skew` away from the current time. Note this @@ -2351,31 +2326,22 @@ impl Consensus { .elapsed() .unwrap_or_else(|err| err.duration()); if !during_sync && difference > self.config.allowed_timestamp_skew { - return Err(( - anyhow!( - "timestamp difference for block {} greater than allowed skew: {difference:?}", - block.view() - ), - false, + return Err(anyhow!( + "timestamp difference for block {} greater than allowed skew: {difference:?}", + block.view() )); } // Blocks must be in sequential order if block.header.number != parent.header.number + 1 { - return Err(( - anyhow!( - "block number is not sequential: {} != {} + 1", - block.header.number, - parent.header.number - ), - false, + return Err(anyhow!( + "block number is not sequential: {} != {} + 1", + block.header.number, + parent.header.number )); } - if !self - .block_extends_from(block, &finalized_block) - .map_err(|e| (e, false))? - { + if !self.block_extends_from(block, &finalized_block)? 
{ warn!( "invalid block {:?}, does not extend finalized block {:?} our head is {:?}", block, @@ -2383,9 +2349,8 @@ impl Consensus { self.head_block() ); - return Err(( - anyhow!("invalid block, does not extend from finalized block"), - false, + return Err(anyhow!( + "invalid block, does not extend from finalized block" )); } Ok(()) diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index fca919a3e..8f4a9b28c 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -228,7 +228,7 @@ impl fmt::Debug for BlockResponse { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct BlockRequestV2 { +pub struct RequestBlocksByHeight { pub request_at: SystemTime, pub from_height: u64, pub to_height: u64, @@ -277,7 +277,7 @@ pub enum ExternalMessage { AddPeer, RemovePeer, InjectedProposal(InjectedProposal), - MetaDataRequest(BlockRequestV2), + MetaDataRequest(RequestBlocksByHeight), MetaDataResponse(Vec), MultiBlockRequest(Vec), MultiBlockResponse(Vec), diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index e8bdfb1f4..88ad98bcb 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -21,8 +21,8 @@ use crate::{ crypto::Hash, db::Db, message::{ - Block, BlockHeader, BlockRequest, BlockRequestV2, BlockResponse, ExternalMessage, - InjectedProposal, Proposal, QuorumCertificate, + Block, BlockHeader, BlockRequest, BlockResponse, ExternalMessage, InjectedProposal, + Proposal, QuorumCertificate, RequestBlocksByHeight, }, node::MessageSender, time::SystemTime, @@ -100,9 +100,6 @@ impl Sync { #[cfg(debug_assertions)] const DO_SPECULATIVE: bool = false; - // Minimum of 2 peers to avoid single source of truth. - const MIN_PEERS: usize = 2; - pub fn new( config: &NodeConfig, db: Arc, @@ -895,7 +892,7 @@ impl Sync { pub fn handle_metadata_request( &mut self, from: PeerId, - request: BlockRequestV2, + request: RequestBlocksByHeight, ) -> Result { tracing::debug!( "sync::MetadataRequest : received a metadata request from {}", @@ -981,7 +978,7 @@ impl Sync { .. }), PeerVer::V2, - ) => ExternalMessage::MetaDataRequest(BlockRequestV2 { + ) => ExternalMessage::MetaDataRequest(RequestBlocksByHeight { request_at: SystemTime::now(), to_height: block_number.saturating_sub(1), from_height: block_number.saturating_sub(self.max_batch_size as u64), @@ -1005,7 +1002,7 @@ impl Sync { let meta = meta.unwrap(); let block_number = meta.number; self.state = SyncState::Phase1(meta); - ExternalMessage::MetaDataRequest(BlockRequestV2 { + ExternalMessage::MetaDataRequest(RequestBlocksByHeight { request_at: SystemTime::now(), to_height: block_number.sub(1), from_height: block_number.sub(self.max_batch_size as u64), @@ -1148,16 +1145,12 @@ impl Sync { /// Get the next best peer to use fn get_next_peer(&mut self) -> Option { - if self.peers.len() >= Self::MIN_PEERS { - let mut peer = self.peers.pop()?; - peer.last_used = std::time::Instant::now(); - // dynamic sizing should not be needed, if we're syncing recent blocks. - // self.max_batch_size = self.dynamic_batch_sizing(&peer); - tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); - return Some(peer); - } - tracing::warn!("sync::NextPeer : {} insufficient peers", self.peers.len()); - None + let mut peer = self.peers.pop()?; + peer.last_used = std::time::Instant::now(); + // dynamic sizing should not be needed, if we're syncing recent blocks. 
+ // self.max_batch_size = self.dynamic_batch_sizing(&peer); + tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score); + Some(peer) } /// Phase 1: Dynamic Batch Sizing diff --git a/zilliqa/tests/it/consensus.rs b/zilliqa/tests/it/consensus.rs index e456f52c3..6946a5e17 100644 --- a/zilliqa/tests/it/consensus.rs +++ b/zilliqa/tests/it/consensus.rs @@ -106,7 +106,6 @@ async fn block_production(mut network: Network) { info!("Adding networked node."); let index = network.add_node(); - network.run_until_synced(index).await; network .run_until( From 108887c570d1f71bce3e1af132e71ba8ecd483e8 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 27 Jan 2025 15:56:57 +0800 Subject: [PATCH 101/119] feat: moved sync db layer from sync.rs to db.rs --- zilliqa/src/db.rs | 163 ++++++++++++++++++++++++++++++- zilliqa/src/sync.rs | 186 ++++-------------------------------- zilliqa/tests/it/staking.rs | 2 +- 3 files changed, 181 insertions(+), 170 deletions(-) diff --git a/zilliqa/src/db.rs b/zilliqa/src/db.rs index aba4e0968..b0ca82f97 100644 --- a/zilliqa/src/db.rs +++ b/zilliqa/src/db.rs @@ -6,13 +6,14 @@ use std::{ ops::Range, path::{Path, PathBuf}, sync::{Arc, Mutex}, - time::Duration, + time::{Duration, Instant}, }; use alloy::primitives::Address; use anyhow::{anyhow, Context, Result}; use eth_trie::{EthTrie, MemoryDB, Trie, DB}; use itertools::Itertools; +use libp2p::PeerId; use lru_mem::LruCache; use lz4::{Decoder, EncoderBuilder}; use rusqlite::{ @@ -28,6 +29,7 @@ use crate::{ exec::{ScillaError, ScillaException, ScillaTransition}, message::{AggregateQc, Block, BlockHeader, QuorumCertificate}, state::Account, + sync::PeerInfo, time::SystemTime, transaction::{EvmGas, Log, SignedTransaction, TransactionReceipt}, }; @@ -326,6 +328,19 @@ impl Db { CREATE TABLE IF NOT EXISTS state_trie (key BLOB NOT NULL PRIMARY KEY, value BLOB NOT NULL) WITHOUT ROWID; ", )?; + connection.execute_batch( + "CREATE TEMP TABLE IF NOT EXISTS sync_data ( + block_hash BLOB NOT NULL UNIQUE, + parent_hash BLOB NOT NULL, + block_number INTEGER NOT NULL PRIMARY KEY, + view_number INTEGER NOT NULL, + gas_used INTEGER NOT NULL, + version INTEGER DEFAULT 0, + peer BLOB DEFAULT NULL + ); + CREATE INDEX IF NOT EXISTS idx_sync_data ON sync_data(block_number) WHERE peer IS NOT NULL;", + )?; + Ok(()) } @@ -340,6 +355,152 @@ impl Db { Ok(Some(base_path.join("checkpoints").into_boxed_path())) } + /// Returns the number of stored sync segments + pub fn count_sync_segments(&self) -> Result { + Ok(self + .db + .lock() + .unwrap() + .prepare_cached("SELECT COUNT(block_number) FROM sync_data WHERE peer IS NOT NULL")? + .query_row([], |row| row.get(0)) + .optional()? + .unwrap_or_default()) + } + + /// Checks if the stored metadata exists + pub fn contains_sync_metadata(&self, hash: &Hash) -> Result { + Ok(self + .db + .lock() + .unwrap() + .prepare_cached("SELECT block_number FROM sync_data WHERE block_hash = ?1")? + .query_row([hash], |row| row.get::<_, u64>(0)) + .optional()? + .is_some()) + } + + /// Retrieves bulk metadata information from the given block_hash (inclusive) + pub fn get_sync_segment(&self, hash: Hash) -> Result> { + let db = self.db.lock().unwrap(); + + let mut hashes = Vec::new(); + let mut block_hash = hash; + + while let Some(parent_hash) = db + .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get::<_, Hash>(0)) + .optional()? 
+ { + hashes.push(block_hash); + block_hash = parent_hash; + } + Ok(hashes) + } + + /// Peeks into the top of the segment stack. + pub fn last_sync_segment(&self) -> Result> { + let db = self.db.lock().unwrap(); + let r = db.prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, gas_used, version, peer FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + .query_row([], |row| Ok(( + BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?), + PeerInfo { + last_used: Instant::now(), + score: u32::MAX, + version: row.get(5)?, + peer_id: PeerId::from_bytes(row.get::<_,Vec>(6)?.as_slice()).unwrap(), + }))).optional()?; + Ok(r) + } + + /// Pushes a particular segment into the stack. + pub fn push_sync_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> { + let db = self.db.lock().unwrap(); + db.prepare_cached( + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used, version, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used, :version, :peer)")? + .execute( + named_params! { + ":parent_hash": meta.qc.block_hash, + ":block_hash": meta.hash, + ":block_number": meta.number, + ":view_number": meta.view, + ":gas_used": meta.gas_used, + ":peer": peer.peer_id.to_bytes(), + ":version": peer.version, + }, + )?; + Ok(()) + } + + /// Bulk inserts a bunch of metadata. + pub fn insert_sync_metadata(&self, metas: &Vec) -> Result<()> { + let mut db = self.db.lock().unwrap(); + let tx = db.transaction()?; + + for meta in metas { + tx.prepare_cached( + "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used)")? + .execute( + named_params! { + ":parent_hash": meta.qc.block_hash, + ":block_hash": meta.hash, + ":block_number": meta.number, + ":view_number": meta.view, + ":gas_used": meta.gas_used, + })?; + } + tx.commit()?; + Ok(()) + } + + /// Empty the metadata table. + pub fn empty_sync_metadata(&self) -> Result<()> { + self.db + .lock() + .unwrap() + .execute("DELETE FROM sync_data", [])?; + Ok(()) + } + + /// Pops a segment from the stack; and bulk removes all metadata associated with it. + pub fn pop_sync_segment(&self) -> Result<()> { + let mut db = self.db.lock().unwrap(); + let c = db.transaction()?; + + if let Some(block_hash) = c.prepare_cached("SELECT block_hash FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + .query_row([], |row| row.get::<_,Hash>(0)).optional()? { + if let Some(parent_hash) = c.prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get(0)).optional()? { + + // update marker + c.prepare_cached( + "UPDATE sync_data SET peer = NULL WHERE block_hash = ?1")? + .execute( + [block_hash] + )?; + + // remove segment + let mut hashes = Vec::new(); + let mut block_hash = parent_hash; + while let Some(parent_hash) = c + .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get::<_, Hash>(0)) + .optional()? + { + hashes.push(block_hash); + block_hash = parent_hash; + } + + for hash in hashes { + c.prepare_cached("DELETE FROM sync_data WHERE block_hash = ?1")? 
+ .execute([hash])?; + } + } + } + + c.commit()?; + Ok(()) + } + /// Fetch checkpoint data from file and initialise db state /// Return checkpointed block and transactions which must be executed after this function /// Return None if checkpoint already loaded diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 88ad98bcb..0bb10c7cb 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -10,11 +10,7 @@ use alloy::primitives::BlockNumber; use anyhow::Result; use itertools::Itertools; use libp2p::PeerId; -use rusqlite::{ - named_params, - types::{FromSql, FromSqlResult, ToSql, ToSqlOutput, ValueRef}, - OptionalExtension, -}; +use rusqlite::types::{FromSql, FromSqlResult, ToSql, ToSqlOutput, ValueRef}; use crate::{ cfg::NodeConfig, @@ -110,37 +106,11 @@ impl Sync { let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. let max_blocks_in_flight = config.max_blocks_in_flight.clamp(max_batch_size, 1800); // up to 30-mins worth of blocks in-pipeline. - // This in-memory DB is placed here as it is only used in this module. - db.with_sqlite_tx(|c| { - c.execute_batch( - "CREATE TEMP TABLE IF NOT EXISTS sync_data ( - block_hash BLOB NOT NULL UNIQUE, - parent_hash BLOB NOT NULL, - block_number INTEGER NOT NULL PRIMARY KEY, - view_number INTEGER NOT NULL, - gas_used INTEGER NOT NULL, - version INTEGER DEFAULT 0, - peer BLOB DEFAULT NULL - ); - CREATE INDEX IF NOT EXISTS idx_sync_data ON sync_data(block_number) WHERE peer IS NOT NULL;", - )?; - Ok(()) - })?; - - // Restore metadata/segments - let mut segments = 0; - db.with_sqlite_tx(|c| { - segments = c - .prepare_cached("SELECT COUNT(block_number) FROM sync_data WHERE peer IS NOT NULL")? - .query_row([], |row| row.get::<_, usize>(0)) - .optional()? - .unwrap_or_default(); - Ok(()) - })?; - let state = if segments == 0 { + // Start from reset, or continue sync + let state = if db.count_sync_segments()? == 0 { SyncState::Phase0 } else { - SyncState::Retry1 + SyncState::Retry1 // continue sync }; let latest_block_number = latest_block @@ -167,156 +137,36 @@ impl Sync { }) } - /// Returns the number of stored segments fn count_segments(&self) -> Result { - let mut segments = 0; - self.db.with_sqlite_tx(|c| { - segments = c - .prepare_cached("SELECT COUNT(block_number) FROM sync_data WHERE peer IS NOT NULL")? - .query_row([], |row| row.get(0)) - .optional()? - .unwrap_or_default(); - Ok(()) - })?; - Ok(segments) + self.db.count_sync_segments() } - /// Checks if the stored metadata exists fn contains_metadata(&self, hash: &Hash) -> Result { - let mut result = false; - self.db.with_sqlite_tx(|c| { - result = c - .prepare_cached("SELECT block_number FROM sync_data WHERE block_hash = ?1")? - .query_row([hash], |row| row.get::<_, u64>(0)) - .optional()? - .is_some(); - Ok(()) - })?; - Ok(result) + self.db.contains_sync_metadata(hash) } - /// Retrieves bulk metadata information from the given block_hash (inclusive) fn get_segment(&self, hash: Hash) -> Result> { - let mut hashes = Vec::with_capacity(self.max_batch_size); - let mut block_hash = hash; - self.db.with_sqlite_tx(|c| { - while let Some(parent_hash) = c - .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? - .query_row([block_hash], |row| row.get::<_, Hash>(0)) - .optional()? - { - hashes.push(block_hash); - block_hash = parent_hash; - } - Ok(()) - })?; - Ok(hashes) + self.db.get_sync_segment(hash) } - /// Peeks into the top of the segment stack. 
fn last_segment(&self) -> Result> { - let mut result = None; - self.db.with_sqlite_tx(|c| { - result = c - .prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, gas_used, version, peer FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? - .query_row([], |row| Ok(( - BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?), - PeerInfo { - last_used: Instant::now(), - score:u32::MAX, - version: row.get(5)?, - peer_id: PeerId::from_bytes(row.get::<_,Vec>(6)?.as_slice()).unwrap(), - }, - ))) - .optional()?; - Ok(()) - })?; - Ok(result) + self.db.last_sync_segment() } - /// Pops a segment from the stack; and bulk removes all metadata associated with it. fn pop_segment(&self) -> Result<()> { - self.db.with_sqlite_tx(|c| { - if let Some(block_hash) = c.prepare_cached("SELECT block_hash FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? - .query_row([], |row| row.get::<_,Hash>(0)).optional()? { - if let Some(parent_hash) = c.prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? - .query_row([block_hash], |row| row.get(0)).optional()? { - - // update marker - c.prepare_cached( - "UPDATE sync_data SET peer = NULL WHERE block_hash = ?1")? - .execute( - [block_hash] - )?; - - // remove segment - let mut hashes = Vec::with_capacity(self.max_batch_size); - let mut block_hash = parent_hash; - while let Some(parent_hash) = c - .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? - .query_row([block_hash], |row| row.get::<_, Hash>(0)) - .optional()? - { - hashes.push(block_hash); - block_hash = parent_hash; - } - - for hash in hashes { - c.prepare_cached("DELETE FROM sync_data WHERE block_hash = ?1")? - .execute([hash])?; - } - } - } - Ok(()) - }) + self.db.pop_sync_segment() } - /// Pushes a particular segment into the stack. fn push_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> { - self.db.with_sqlite_tx(|c| { - c.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used, version, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used, :version, :peer)")? - .execute( - named_params! { - ":parent_hash": meta.qc.block_hash, - ":block_hash": meta.hash, - ":block_number": meta.number, - ":view_number": meta.view, - ":gas_used": meta.gas_used, - ":peer": peer.peer_id.to_bytes(), - ":version": peer.version, - }, - )?; - Ok(()) - }) + self.db.push_sync_segment(peer, meta) } - /// Bulk inserts a bunch of metadata. fn insert_metadata(&self, metas: &Vec) -> Result<()> { - self.db.with_sqlite_tx(|c| { - for meta in metas { - c.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used)")? - .execute( - named_params! { - ":parent_hash": meta.qc.block_hash, - ":block_hash": meta.hash, - ":block_number": meta.number, - ":view_number": meta.view, - ":gas_used": meta.gas_used, - }, - )?; - } - Ok(()) - }) + self.db.insert_sync_metadata(metas) } - /// Empty the metadata table. fn empty_metadata(&self) -> Result<()> { - self.db.with_sqlite_tx(|c| { - c.execute("DELETE FROM sync_data", [])?; - Ok(()) - }) + self.db.empty_sync_metadata() } /// Phase 0: Sync a block proposal. 
@@ -1218,11 +1068,11 @@ impl Sync { } #[derive(Debug, Clone, Eq, PartialEq)] -struct PeerInfo { - score: u32, - peer_id: PeerId, - last_used: Instant, - version: PeerVer, +pub struct PeerInfo { + pub score: u32, + pub peer_id: PeerId, + pub last_used: Instant, + pub version: PeerVer, } impl Ord for PeerInfo { @@ -1274,7 +1124,7 @@ enum SyncState { /// Peer Version #[derive(Debug, Clone, Eq, PartialEq)] -enum PeerVer { +pub enum PeerVer { V1 = 1, V2 = 2, } diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index 179b46d84..2fcaab5d3 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -430,7 +430,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let wallet = network.genesis_wallet().await; // randomise the current epoch state and current leader - let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..8); + let blocks_to_prerun = network.rng.lock().unwrap().gen_range(0..4); network .run_until_block(&wallet, blocks_to_prerun.into(), 200) .await; From 40c84ce977970221d722c15e4fb1d97302d70ffe Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 27 Jan 2025 17:58:39 +0800 Subject: [PATCH 102/119] feat: moved internal sync-peers to SyncPeers shared-state. https://github.com/Zilliqa/zq2/pull/2089/files#r1927206432 --- zilliqa/src/consensus.rs | 13 ++- zilliqa/src/node.rs | 18 +-- zilliqa/src/node_launcher.rs | 10 +- zilliqa/src/p2p_node.rs | 16 ++- zilliqa/src/sync.rs | 214 +++++++++++++++++++---------------- 5 files changed, 155 insertions(+), 116 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index f7408beb5..bc044c001 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -36,7 +36,7 @@ use crate::{ node::{MessageSender, NetworkMessage, OutgoingMessageFailure}, pool::{TransactionPool, TxAddResult, TxPoolContent}, state::State, - sync::Sync, + sync::{Sync, SyncPeers}, time::SystemTime, transaction::{EvmGas, SignedTransaction, TransactionReceipt, VerifiedTransaction}, }; @@ -187,6 +187,7 @@ impl Consensus { message_sender: MessageSender, reset_timeout: UnboundedSender, db: Arc, + peers: Arc, ) -> Result { trace!( "Opening database in {:?} for shard {}", @@ -312,7 +313,13 @@ impl Consensus { } }; - let sync = Sync::new(&config, db.clone(), &latest_block, message_sender.clone())?; + let sync = Sync::new( + &config, + db.clone(), + &latest_block, + message_sender.clone(), + peers.clone(), + )?; let mut consensus = Consensus { secret_key, @@ -411,7 +418,7 @@ impl Consensus { }) .collect_vec(); - consensus.sync.add_peers(recent_peer_ids); + peers.add_peers(recent_peer_ids); } Ok(consensus) diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index d04a7a7ec..edd6f835f 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -40,6 +40,7 @@ use crate::{ p2p_node::{LocalMessageTuple, OutboundMessageTuple}, pool::{TxAddResult, TxPoolContent}, state::State, + sync::SyncPeers, transaction::{ EvmGas, SignedTransaction, TransactionReceipt, TxIntershard, VerifiedTransaction, }, @@ -170,6 +171,7 @@ impl ChainId { } impl Node { + #[allow(clippy::too_many_arguments)] pub fn new( config: NodeConfig, secret_key: SecretKey, @@ -178,6 +180,7 @@ impl Node { request_responses: UnboundedSender<(ResponseChannel, ExternalMessage)>, reset_timeout: UnboundedSender, peer_num: Arc, + peers: Arc, ) -> Result { config.validate()?; let peer_id = secret_key.to_libp2p_keypair().public().to_peer_id(); @@ -201,7 +204,14 @@ impl Node { reset_timeout: reset_timeout.clone(), db: db.clone(), chain_id: 
ChainId::new(config.eth_chain_id), - consensus: Consensus::new(secret_key, config, message_sender, reset_timeout, db)?, + consensus: Consensus::new( + secret_key, + config, + message_sender, + reset_timeout, + db, + peers, + )?, peer_num, }; Ok(node) @@ -227,12 +237,6 @@ impl Node { )))?; } } - ExternalMessage::AddPeer => { - self.consensus.sync.add_peer(from); - } - ExternalMessage::RemovePeer => { - self.consensus.sync.remove_peer(from); - } // `Proposals` are re-routed to `handle_request()` _ => { warn!("unexpected message type"); diff --git a/zilliqa/src/node_launcher.rs b/zilliqa/src/node_launcher.rs index 747208918..50d10cf52 100644 --- a/zilliqa/src/node_launcher.rs +++ b/zilliqa/src/node_launcher.rs @@ -32,6 +32,7 @@ use crate::{ message::{ExternalMessage, InternalMessage}, node::{self, OutgoingMessageFailure}, p2p_node::{LocalMessageTuple, OutboundMessageTuple}, + sync::SyncPeers, }; pub struct NodeLauncher { @@ -96,7 +97,7 @@ impl NodeLauncher { local_outbound_message_sender: UnboundedSender, request_responses_sender: UnboundedSender<(ResponseChannel, ExternalMessage)>, peer_num: Arc, - ) -> Result<(Self, NodeInputChannels)> { + ) -> Result<(Self, NodeInputChannels, Arc)> { /// Helper to create a (sender, receiver) pair for a channel. fn sender_receiver() -> (UnboundedSender, UnboundedReceiverStream) { let (sender, receiver) = mpsc::unbounded_channel(); @@ -110,6 +111,9 @@ impl NodeLauncher { let (local_messages_sender, local_messages_receiver) = sender_receiver(); let (reset_timeout_sender, reset_timeout_receiver) = sender_receiver(); + let peer_id = secret_key.to_libp2p_keypair().public().to_peer_id(); + let peers: Arc = Arc::new(SyncPeers::new(peer_id)); + let node = Node::new( config.clone(), secret_key, @@ -118,7 +122,9 @@ impl NodeLauncher { request_responses_sender, reset_timeout_sender.clone(), peer_num, + peers.clone(), )?; + let node = Arc::new(Mutex::new(node)); for api_server in &config.api_servers { @@ -168,7 +174,7 @@ impl NodeLauncher { local_messages: local_messages_sender, }; - Ok((launcher, input_channels)) + Ok((launcher, input_channels, peers)) } pub async fn start_shard_node(&mut self) -> Result<()> { diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index 6fd941b23..24cc271ad 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -37,6 +37,7 @@ use crate::{ message::{ExternalMessage, InternalMessage}, node::{OutgoingMessageFailure, RequestId}, node_launcher::{NodeInputChannels, NodeLauncher, ResponseChannel}, + sync::SyncPeers, }; /// Messages are a tuple of the destination shard ID and the actual message. @@ -61,6 +62,7 @@ pub type OutboundMessageTuple = (Option<(PeerId, RequestId)>, u64, ExternalMessa pub type LocalMessageTuple = (u64, u64, InternalMessage); pub struct P2pNode { + shard_peers: HashMap>, shard_nodes: HashMap, shard_threads: JoinSet>, task_threads: JoinSet>, @@ -148,6 +150,7 @@ impl P2pNode { .build(); Ok(Self { + shard_peers: HashMap::new(), shard_nodes: HashMap::new(), peer_id, secret_key, @@ -194,7 +197,7 @@ impl P2pNode { info!("LaunchShard message received for a shard we're already running. 
Ignoring..."); return Ok(()); } - let (mut node, input_channels) = NodeLauncher::new( + let (mut node, input_channels, peers) = NodeLauncher::new( self.secret_key, config, self.outbound_message_sender.clone(), @@ -203,6 +206,7 @@ impl P2pNode { self.peer_num.clone(), ) .await?; + self.shard_peers.insert(topic.hash(), peers); self.shard_nodes.insert(topic.hash(), input_channels); self.shard_threads .spawn(async move { node.start_shard_node().await }); @@ -266,12 +270,14 @@ impl P2pNode { .add_address(&peer_id, address.clone()); } SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Subscribed { peer_id, topic })) => { - let message = ExternalMessage::AddPeer; - self.send_to(&topic, |c| c.broadcasts.send((peer_id, message)))?; + if let Some(peers) = self.shard_peers.get(&topic) { + peers.add_peer(peer_id); + } } SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Unsubscribed { peer_id, topic })) => { - let message = ExternalMessage::RemovePeer; - self.send_to(&topic, |c| c.broadcasts.send((peer_id, message)))?; + if let Some(peers) = self.shard_peers.get(&topic) { + peers.remove_peer(peer_id); + } } SwarmEvent::Behaviour(BehaviourEvent::Gossipsub(gossipsub::Event::Message{ message_id: msg_id, diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 0bb10c7cb..db972dcbc 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -2,7 +2,7 @@ use std::{ cmp::Ordering, collections::{BinaryHeap, VecDeque}, ops::Sub, - sync::Arc, + sync::{Arc, Mutex}, time::{Duration, Instant}, }; @@ -62,11 +62,10 @@ pub struct Sync { db: Arc, // message bus message_sender: MessageSender, - // internal list of peers, maintained with add_peer/remove_peer. - peers: BinaryHeap, + // internal peers + peers: Arc, // peer handling an in-flight request in_flight: Option, - in_flight_reason: DownGrade, // in-flight request timeout, before retry request_timeout: Duration, // how many blocks to request at once @@ -101,6 +100,7 @@ impl Sync { db: Arc, latest_block: &Option, message_sender: MessageSender, + peers: Arc, ) -> Result { let peer_id = message_sender.our_peer_id; let max_batch_size = config.block_request_batch_size.clamp(30, 180); // up to 180 sec of blocks at a time. @@ -121,13 +121,12 @@ impl Sync { Ok(Self { db, message_sender, - peers: BinaryHeap::new(), peer_id, + peers, request_timeout: config.consensus.consensus_timeout, max_batch_size, max_blocks_in_flight, in_flight: None, - in_flight_reason: DownGrade::None, in_pipeline: usize::MIN, state, recent_proposals: VecDeque::with_capacity(max_batch_size), @@ -364,11 +363,13 @@ impl Sync { if response.is_empty() { // Empty response, downgrade peer and retry phase 1. 
tracing::warn!("sync::MultiBlockResponse : empty blocks {from}",); - self.done_with_peer(DownGrade::Empty); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::Empty); self.state = SyncState::Retry1; return Ok(()); } else { - self.done_with_peer(DownGrade::None); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::None); } tracing::info!( @@ -465,7 +466,8 @@ impl Sync { "sync::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", peer.peer_id ); - self.done_with_peer(DownGrade::Timeout); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::Timeout); } else { return Ok(()); } @@ -478,9 +480,9 @@ impl Sync { } // will be re-inserted below - if let Some(peer) = self.get_next_peer() { + if let Some(peer) = self.peers.get_next_peer() { // reinsert peer, as we will use a faux peer below, to force the request to go to the original responder - self.peers.push(peer); + self.peers.reinsert_peer(peer)?; // If we have no chain_segments, we have nothing to do if let Some((meta, peer_info)) = self.last_segment()? { @@ -550,19 +552,11 @@ impl Sync { && response.from_view == u64::MAX { tracing::info!("sync::HandleBlockResponse : upgrading {from}",); - if let Some(peer) = self.in_flight.as_mut() { + if let Some(mut peer) = self.in_flight.take() { if peer.peer_id == from { peer.version = PeerVer::V2; // retry with upgraded peer - peer.last_used = self - .peers - .peek() - .expect("peers.len() > 1") - .last_used - .checked_sub(Duration::from_secs(1)) - .expect("time is ordinal"); - self.done_with_peer(DownGrade::None); - + self.peers.reinsert_peer(peer)?; if Self::DO_SPECULATIVE { match self.state { SyncState::Phase1(_) => self.request_missing_metadata(None)?, @@ -667,10 +661,12 @@ impl Sync { if response.is_empty() { // Empty response, downgrade peer and retry with a new peer. tracing::warn!("sync::MetadataResponse : empty blocks {from}",); - self.done_with_peer(DownGrade::Empty); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::Empty); return Ok(()); } else { - self.done_with_peer(DownGrade::None); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::None); } // Check the linkage of the returned chain @@ -801,7 +797,8 @@ impl Sync { "sync::RequestMissingMetadata : in-flight request {} timed out, requesting from new peer", peer.peer_id ); - self.done_with_peer(DownGrade::Timeout); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::Timeout); } else { return Ok(()); } @@ -814,7 +811,7 @@ impl Sync { return Ok(()); } - if let Some(peer) = self.get_next_peer() { + if let Some(peer) = self.peers.get_next_peer() { tracing::info!( "sync::RequestMissingMetadata : requesting {} metadata of segment #{} from {}", self.max_batch_size, @@ -942,26 +939,84 @@ impl Sync { Ok(()) } + /// Returns (am_syncing, current_highest_block) + pub fn am_syncing(&self) -> Result { + Ok(self.in_pipeline != 0 + || !matches!(self.state, SyncState::Phase0) + || !self.recent_proposals.is_empty() + || self.count_segments()? != 0) + } + + // Returns (starting_block, current_block, highest_block) if we're syncing, + // None if we're not. + pub fn get_sync_data(&self) -> Result> { + let flag = self.am_syncing()?; + if !flag { + Ok(None) + } else { + let highest_block = self + .db + .get_canonical_block_by_number( + self.db + .get_highest_canonical_block_number()? + .expect("no highest block"), + )? 
+ .expect("missing highest block"); + + let highest_saved_block_number = highest_block.number(); + let highest_block_number_seen = self.recent_proposals.back().unwrap().number(); + Ok(Some(( + self.started_at_block_number, + highest_saved_block_number, + highest_block_number_seen, + ))) + } + } + + /// Sets the checkpoint, if node was started from a checkpoint. + pub fn set_checkpoint(&mut self, checkpoint: &Block) { + let hash = checkpoint.hash(); + tracing::info!("sync::Checkpoint {}", hash); + self.checkpoint_hash = hash; + } +} + +#[derive(Debug)] +pub struct SyncPeers { + peer_id: PeerId, + peers: Arc<Mutex<BinaryHeap<PeerInfo>>>, +} + +impl SyncPeers { + pub fn new(peer_id: PeerId) -> Self { + Self { + peer_id, + peers: Arc::new(Mutex::new(BinaryHeap::<PeerInfo>::new())), + } + } + /// Downgrade a peer based on the response received. /// /// This algorithm favours good peers that respond quickly (i.e. no timeout). /// In most cases, it eventually degenerates into 2 sources - avoid a single source of truth. - fn done_with_peer(&mut self, downgrade: DownGrade) { - if let Some(mut peer) = self.in_flight.take() { + fn done_with_peer(&self, in_flight: Option<PeerInfo>, downgrade: DownGrade) { + if let Some(mut peer) = in_flight { tracing::trace!("sync::DoneWithPeer {} {:?}", peer.peer_id, downgrade); - self.in_flight_reason = downgrade.clone(); + let mut peers = self.peers.lock().unwrap(); peer.score = peer.score.saturating_add(downgrade as u32); - // Ensure that the next peer is equal or better - peer.score = peer.score.max(self.peers.peek().unwrap().score); + if !peers.is_empty() { + // Ensure that the next peer is equal or better + peer.score = peer.score.max(peers.peek().unwrap().score); + } // Reinsert peers that are good if peer.score < u32::MAX { - self.peers.push(peer); + peers.push(peer); } } } /// Add bulk peers - pub fn add_peers(&mut self, peers: Vec<PeerId>) { + pub fn add_peers(&self, peers: Vec<PeerId>) { tracing::debug!("sync::AddPeers {:?}", peers); for peer in peers { if peer != self.peer_id { @@ -971,31 +1026,33 @@ impl Sync { } /// Add a peer to the list of peers. 
- pub fn remove_peer(&mut self, peer: PeerId) { - tracing::trace!("sync::RemovePeer {peer}"); - self.peers.retain(|p: &PeerInfo| p.peer_id != peer); + pub fn remove_peer(&self, peer: PeerId) { + let mut peers = self.peers.lock().unwrap(); + peers.retain(|p: &PeerInfo| p.peer_id != peer); + tracing::trace!("sync::RemovePeer {peer}/{}", peers.len()); } /// Get the next best peer to use - fn get_next_peer(&mut self) -> Option { - let mut peer = self.peers.pop()?; + pub fn get_next_peer(&self) -> Option { + let mut peer = self.peers.lock().unwrap().pop()?; peer.last_used = std::time::Instant::now(); // dynamic sizing should not be needed, if we're syncing recent blocks. // self.max_batch_size = self.dynamic_batch_sizing(&peer); @@ -1003,67 +1060,26 @@ impl Sync { Some(peer) } - /// Phase 1: Dynamic Batch Sizing - /// - /// Due to a hard-coded 10MB response limit in libp2p, we may be limited in how many blocks we can request - /// for in a single request, between 1-100 blocks. - fn _dynamic_batch_sizing(&self, peer: &PeerInfo) -> usize { - match (&self.state, &peer.version, &self.in_flight_reason) { - // V1 response may be too large, reduce request range. - (SyncState::Phase1(_), PeerVer::V1, DownGrade::Empty) => self - .max_batch_size - .saturating_sub(self.max_batch_size / 3) - .max(1), - // V1 response going well, increase the request range - (SyncState::Phase1(_), PeerVer::V1, DownGrade::None) => self - .max_batch_size - .saturating_add(self.max_batch_size) - // For V1, ~100 empty blocks saturates the response payload - .min(100), - // V2 response may be too large, which can induce a timeout. Split into 10 block segments - _ => self.max_batch_size, - } - } - - /// Returns (am_syncing, current_highest_block) - pub fn am_syncing(&self) -> Result { - Ok(self.in_pipeline != 0 - || !matches!(self.state, SyncState::Phase0) - || !self.recent_proposals.is_empty() - || self.count_segments()? != 0) + /// Reinserts the peer such that it is at the front of the queue. + pub fn reinsert_peer(&self, peer: PeerInfo) -> Result<()> { + let mut peers = self.peers.lock().unwrap(); + let mut peer = peer; + peer.last_used = peers + .peek() + .expect("peers.len() > 1") + .last_used + .checked_sub(Duration::from_secs(1)) + .expect("time is ordinal"); + peers.push(peer); + Ok(()) } - // Returns (starting_block, current_block, highest_block) if we're syncing, - // None if we're not. - pub fn get_sync_data(&self) -> Result> { - let flag = self.am_syncing()?; - if !flag { - Ok(None) - } else { - let highest_block = self - .db - .get_canonical_block_by_number( - self.db - .get_highest_canonical_block_number()? - .expect("no highest block"), - )? - .expect("missing highest block"); - - let highest_saved_block_number = highest_block.number(); - let highest_block_number_seen = self.recent_proposals.back().unwrap().number(); - Ok(Some(( - self.started_at_block_number, - highest_saved_block_number, - highest_block_number_seen, - ))) - } + pub fn len(&self) -> usize { + self.peers.lock().unwrap().len() } - /// Sets the checkpoint, if node was started from a checkpoint. - pub fn set_checkpoint(&mut self, checkpoint: &Block) { - let hash = checkpoint.hash(); - tracing::info!("sync::Checkpoint {}", hash); - self.checkpoint_hash = hash; + pub fn is_empty(&self) -> bool { + self.peers.lock().unwrap().is_empty() } } From 3ede63115aaa5e2a8e791443bdb762c6bbcbfc1d Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 27 Jan 2025 17:59:33 +0800 Subject: [PATCH 103/119] fix: tests with shared-state SyncPeers. 
--- z2/src/docgen.rs | 16 +++++++++++++--- zilliqa/benches/it.rs | 7 ++++++- zilliqa/tests/it/main.rs | 25 +++++++++++-------------- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/z2/src/docgen.rs b/z2/src/docgen.rs index 1f9b25a52..76d71a30c 100644 --- a/z2/src/docgen.rs +++ b/z2/src/docgen.rs @@ -14,7 +14,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use tera::Tera; use tokio::fs; -use zilliqa::{cfg::NodeConfig, crypto::SecretKey}; +use zilliqa::{cfg::NodeConfig, crypto::SecretKey, sync::SyncPeers}; const SUPPORTED_APIS_PATH_NAME: &str = "index"; @@ -352,10 +352,20 @@ pub fn get_implemented_jsonrpc_methods() -> Result Consensus { let secret_key = genesis_deposits[index].0; + let peer_id = secret_key.to_libp2p_keypair().public().to_peer_id(); let (outbound_message_sender, a) = mpsc::unbounded_channel(); let (local_message_sender, b) = mpsc::unbounded_channel(); let (reset_timeout_sender, c) = mpsc::unbounded_channel(); @@ -208,6 +212,7 @@ fn consensus( message_sender, reset_timeout_sender, Arc::new(db), + Arc::new(SyncPeers::new(peer_id)), ) .unwrap() } diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index e1267087f..c90064ef7 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -77,6 +77,7 @@ use zilliqa::{ message::{ExternalMessage, InternalMessage}, node::{Node, RequestId}, node_launcher::ResponseChannel, + sync::SyncPeers, transaction::EvmGas, }; @@ -165,6 +166,9 @@ fn node( let (reset_timeout_sender, reset_timeout_receiver) = mpsc::unbounded_channel(); std::mem::forget(reset_timeout_receiver); + let peer_id = secret_key.to_libp2p_keypair().public().to_peer_id(); + let peers = Arc::new(SyncPeers::new(peer_id)); + let node = Node::new( NodeConfig { data_dir: datadir @@ -178,6 +182,7 @@ fn node( request_responses_sender, reset_timeout_sender, Arc::new(AtomicUsize::new(0)), + peers.clone(), )?; let node = Arc::new(Mutex::new(node)); let rpc_module: RpcModule>> = @@ -186,12 +191,13 @@ fn node( Ok(( TestNode { index, - peer_id: secret_key.to_libp2p_keypair().public().to_peer_id(), + peer_id, secret_key, onchain_key, inner: node, dir: datadir, rpc_module, + peers, }, message_receiver, local_message_receiver, @@ -208,6 +214,7 @@ struct TestNode { rpc_module: RpcModule>>, inner: Arc>, dir: Option, + peers: Arc, } struct Network { @@ -411,12 +418,7 @@ impl Network { node.peer_id, node.dir.as_ref().unwrap().path().to_string_lossy(), ); - node.inner - .lock() - .unwrap() - .consensus - .sync - .add_peers(peers.clone()); + node.peers.add_peers(peers.clone()); } Network { @@ -516,7 +518,7 @@ impl Network { let mut peers = self.nodes.iter().map(|n| n.peer_id).collect_vec(); peers.shuffle(self.rng.lock().unwrap().deref_mut()); - node.inner.lock().unwrap().consensus.sync.add_peers(peers); + node.peers.add_peers(peers.clone()); trace!("Node {}: {}", node.index, node.peer_id); @@ -590,12 +592,7 @@ impl Network { node.peer_id, node.dir.as_ref().unwrap().path().to_string_lossy(), ); - node.inner - .lock() - .unwrap() - .consensus - .sync - .add_peers(peers.clone()); + node.peers.add_peers(peers.clone()); } let (resend_message, receive_resend_message) = mpsc::unbounded_channel::(); From d4f6d26f4662f59c88828fbbe1be8931590205a7 Mon Sep 17 00:00:00 2001 From: Shawn Date: Mon, 27 Jan 2025 23:08:56 +0800 Subject: [PATCH 104/119] fix: issue of upgraded node, encountered in protomainnet where a node was recorded as V1 in Phase1, but was updated to V2 in Phase2, causing the sync to be stuck in a loop. 
--- zilliqa/src/sync.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index db972dcbc..27a431dd9 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -546,23 +546,25 @@ impl Sync { /// In phase 1, it will extract the metadata and feed it into handle_metadata_response. /// In phase 2, it will extract the blocks and feed it into handle_multiblock_response. pub fn handle_block_response(&mut self, from: PeerId, response: BlockResponse) -> Result<()> { - // Upgrade to V2 peer. + // V2 response if response.availability.is_none() && response.proposals.is_empty() && response.from_view == u64::MAX { - tracing::info!("sync::HandleBlockResponse : upgrading {from}",); + tracing::info!("sync::HandleBlockResponse : new response from {from}",); if let Some(mut peer) = self.in_flight.take() { - if peer.peer_id == from { + if peer.peer_id == from && peer.version == PeerVer::V1 { + // upgrade to V2 peer peer.version = PeerVer::V2; - // retry with upgraded peer self.peers.reinsert_peer(peer)?; - if Self::DO_SPECULATIVE { - match self.state { - SyncState::Phase1(_) => self.request_missing_metadata(None)?, - SyncState::Phase2(_) => self.request_missing_blocks()?, - _ => {} + match self.state { + SyncState::Phase2(_) => { + self.state = SyncState::Retry1; } + SyncState::Phase1(_) if Self::DO_SPECULATIVE => { + self.request_missing_metadata(None)?; + } + _ => {} } } } @@ -1062,6 +1064,9 @@ impl SyncPeers { /// Reinserts the peer such that it is at the front of the queue. pub fn reinsert_peer(&self, peer: PeerInfo) -> Result<()> { + if peer.score == u32::MAX { + return Ok(()); + } let mut peers = self.peers.lock().unwrap(); let mut peer = peer; peer.last_used = peers From 7cd32214f51b43e6787c62bdeb1dfc4471f3445d Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 09:04:06 +0800 Subject: [PATCH 105/119] nit: increase deposit_v3 boundary to 24. 
--- zilliqa/tests/it/staking.rs | 7 +++---- zilliqa/tests/it/unreliable.rs | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index 2fcaab5d3..b423ffcde 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -425,7 +425,7 @@ async fn rewards_are_sent_to_reward_address_of_proposer(mut network: Network) { check_miner_got_reward(&wallet, 1).await; } -#[zilliqa_macros::test(blocks_per_epoch = 2, deposit_v3_upgrade_block_height = 12)] +#[zilliqa_macros::test(blocks_per_epoch = 2, deposit_v3_upgrade_block_height = 24)] async fn validators_can_join_and_become_proposer(mut network: Network) { let wallet = network.genesis_wallet().await; @@ -447,7 +447,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { let staker_wallet = network.wallet_of_node(index).await; let pop_sinature = new_validator_key.pop_prove(); - // This has to be done before `contract_upgrade_block_heights` which is 12, by default in the tests + // This has to be done before `contract_upgrade_block_heights` which is 24, by default in this test let deposit_hash = deposit_stake( &mut network, &wallet, @@ -514,7 +514,6 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { check_miner_got_reward(&wallet, BlockNumber::Latest).await; // Now test joining deposit_v3 - let deposit_v3_deploy_block = 12; let index = network.add_node(); let new_validator_priv_key = network.get_node_raw(index).secret_key; let new_validator_pub_key = new_validator_priv_key.node_public_key(); @@ -533,7 +532,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { // Give new node time to catch up to block including deposit_v3 deployment network - .run_until_block(&staker_wallet, deposit_v3_deploy_block.into(), 200) + .run_until_block(&staker_wallet, 24.into(), 200) .await; let deposit_hash = deposit_v3_stake( diff --git a/zilliqa/tests/it/unreliable.rs b/zilliqa/tests/it/unreliable.rs index b35787e93..7518a3131 100644 --- a/zilliqa/tests/it/unreliable.rs +++ b/zilliqa/tests/it/unreliable.rs @@ -26,7 +26,6 @@ async fn blocks_are_produced_while_a_node_restarts(mut network: Network) { // Reconnect the 'restarted' node. network.connect_node(restarted_node); - network.run_until_synced(restarted_node).await; // TODO(#721): We should assert here that a new view occurred if-and-only-if the 'restarted' node was the proposer // of blocks 3 or 4. This would tell us that we aren't producing new views unnecessarily. From 19da84821457d2d36d074399d46e443f2f866b1a Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 10:15:00 +0800 Subject: [PATCH 106/119] feat: use libp2p timeout instead of internal sync timeout. 
--- zilliqa/src/consensus.rs | 9 +- zilliqa/src/node.rs | 2 +- zilliqa/src/p2p_node.rs | 3 +- zilliqa/src/sync.rs | 214 +++++++++++++++++---------------------- 4 files changed, 98 insertions(+), 130 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index bc044c001..9487ae98b 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -33,7 +33,7 @@ use crate::{ ExternalMessage, InternalMessage, NewView, Proposal, QuorumCertificate, Vote, MAX_COMMITTEE_SIZE, }, - node::{MessageSender, NetworkMessage, OutgoingMessageFailure}, + node::{MessageSender, NetworkMessage}, pool::{TransactionPool, TxAddResult, TxPoolContent}, state::State, sync::{Sync, SyncPeers}, @@ -3125,13 +3125,6 @@ impl Consensus { Ok(count) } - pub fn report_outgoing_message_failure( - &mut self, - _failure: OutgoingMessageFailure, - ) -> Result<()> { - Ok(()) // FIXME: Stub - } - pub fn tick(&mut self) -> Result<()> { trace!("consensus::tick()"); trace!("request_missing_blocks from timer"); diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index edd6f835f..c1dccc01f 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -331,7 +331,7 @@ impl Node { failure: OutgoingMessageFailure, ) -> Result<()> { debug!(from = %self.peer_id, %to, ?failure, "handling message failure"); - self.consensus.report_outgoing_message_failure(failure)?; + self.consensus.sync.handle_request_failure(failure)?; Ok(()) } diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs index 24cc271ad..21737468e 100644 --- a/zilliqa/src/p2p_node.rs +++ b/zilliqa/src/p2p_node.rs @@ -112,7 +112,8 @@ impl P2pNode { Ok(Behaviour { request_response: request_response::cbor::Behaviour::new( iter::once((StreamProtocol::new("/zq2-message/1"), ProtocolSupport::Full)), - Default::default(), + request_response::Config::default() + .with_request_timeout(Duration::from_secs(10)), ), gossipsub: gossipsub::Behaviour::new( MessageAuthenticity::Signed(key_pair.clone()), diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 27a431dd9..b5b829063 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -20,7 +20,7 @@ use crate::{ Block, BlockHeader, BlockRequest, BlockResponse, ExternalMessage, InjectedProposal, Proposal, QuorumCertificate, RequestBlocksByHeight, }, - node::MessageSender, + node::{MessageSender, OutgoingMessageFailure, RequestId}, time::SystemTime, transaction::EvmGas, }; @@ -65,9 +65,7 @@ pub struct Sync { // internal peers peers: Arc, // peer handling an in-flight request - in_flight: Option, - // in-flight request timeout, before retry - request_timeout: Duration, + in_flight: Option<(PeerInfo, RequestId)>, // how many blocks to request at once max_batch_size: usize, // how many blocks to inject into the queue @@ -123,7 +121,6 @@ impl Sync { message_sender, peer_id, peers, - request_timeout: config.consensus.consensus_timeout, max_batch_size, max_blocks_in_flight, in_flight: None, @@ -136,36 +133,19 @@ impl Sync { }) } - fn count_segments(&self) -> Result { - self.db.count_sync_segments() - } - - fn contains_metadata(&self, hash: &Hash) -> Result { - self.db.contains_sync_metadata(hash) - } - - fn get_segment(&self, hash: Hash) -> Result> { - self.db.get_sync_segment(hash) - } - - fn last_segment(&self) -> Result> { - self.db.last_sync_segment() - } - - fn pop_segment(&self) -> Result<()> { - self.db.pop_sync_segment() - } - - fn push_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> { - self.db.push_sync_segment(peer, meta) - } - - fn insert_metadata(&self, metas: &Vec) -> 
Result<()> { - self.db.insert_sync_metadata(metas) - } - - fn empty_metadata(&self) -> Result<()> { - self.db.empty_sync_metadata() + pub fn handle_request_failure(&mut self, failure: OutgoingMessageFailure) -> Result<()> { + // chekc if the request is a sync messages + if let Some((peer, req_id)) = self.in_flight.as_ref() { + // downgrade peer due to timeout + if peer.peer_id == failure.peer && *req_id == failure.request_id { + tracing::warn!(to = %peer.peer_id, err = %failure.error, + "sync::RequestFailure : in-flight request failed" + ); + self.peers + .done_with_peer(self.in_flight.take(), DownGrade::Timeout); + } + } + Ok(()) } /// Phase 0: Sync a block proposal. @@ -239,14 +219,14 @@ impl Sync { tracing::info!( "sync::SyncProposal : finishing {} blocks for segment #{} from {}", self.recent_proposals.len(), - self.count_segments()?, + self.db.count_sync_segments()?, self.peer_id, ); // inject the proposals let proposals = self.recent_proposals.drain(..).collect_vec(); self.inject_proposals(proposals)?; } - self.empty_metadata()?; + self.db.empty_sync_metadata()?; self.state = SyncState::Phase0; } // Retry to fix sync issues e.g. peers that are now offline @@ -301,7 +281,7 @@ impl Sync { /// This will rebuild history from the previous marker, with another peer. /// If this function is called many times, it will eventually restart from Phase 0. fn retry_phase1(&mut self) -> Result<()> { - if self.count_segments()? == 0 { + if self.db.count_sync_segments()? == 0 { tracing::error!("sync::RetryPhase1 : cannot retry phase 1 without chain segments!"); self.state = SyncState::Phase0; return Ok(()); @@ -309,12 +289,12 @@ impl Sync { tracing::debug!( "sync::RetryPhase1 : retrying segment #{}", - self.count_segments()?, + self.db.count_sync_segments()?, ); // remove the last segment from the chain metadata - let (meta, _) = self.last_segment()?.unwrap(); - self.pop_segment()?; + let (meta, _) = self.db.last_sync_segment()?.unwrap(); + self.db.pop_sync_segment()?; self.state = SyncState::Phase1(meta); Ok(()) @@ -346,7 +326,7 @@ impl Sync { from: PeerId, response: Vec, ) -> Result<()> { - if let Some(peer) = self.in_flight.as_ref() { + if let Some((peer, _)) = self.in_flight.as_ref() { if peer.peer_id != from { tracing::warn!( "sync::MultiBlockResponse : unexpected peer={} != {from}", @@ -375,7 +355,7 @@ impl Sync { tracing::info!( "sync::MultiBlockResponse : received {} blocks for segment #{} from {}", response.len(), - self.count_segments()?, + self.db.count_sync_segments()?, from ); @@ -405,11 +385,11 @@ impl Sync { .sorted_by_key(|p| p.number()) .collect_vec(); - self.pop_segment()?; + self.db.pop_sync_segment()?; self.inject_proposals(proposals)?; // Done with phase 2 - if self.count_segments()? == 0 { + if self.db.count_sync_segments()? == 0 { self.state = SyncState::Phase3; } else if Self::DO_SPECULATIVE { // Speculatively request more blocks @@ -460,21 +440,11 @@ impl Sync { anyhow::bail!("sync::RequestMissingBlocks : invalid state"); } // Early exit if there's a request in-flight; and if it has not expired. 
- if let Some(peer) = self.in_flight.as_ref() { - if peer.last_used.elapsed() > self.request_timeout { - tracing::warn!( - "sync::RequestMissingBlocks : in-flight request {} timed out, requesting from new peer", - peer.peer_id - ); - self.peers - .done_with_peer(self.in_flight.take(), DownGrade::Timeout); - } else { - return Ok(()); - } - } else if self.in_pipeline > self.max_blocks_in_flight { + if self.in_flight.is_some() || self.in_pipeline > self.max_blocks_in_flight { tracing::warn!( - "sync::RequestMissingBlocks : syncing {} blocks in pipeline", - self.in_pipeline + "sync::RequestMissingBlocks : syncing {}/{} blocks in pipeline", + self.in_pipeline, + self.max_blocks_in_flight ); return Ok(()); } @@ -485,8 +455,8 @@ impl Sync { self.peers.reinsert_peer(peer)?; // If we have no chain_segments, we have nothing to do - if let Some((meta, peer_info)) = self.last_segment()? { - let request_hashes = self.get_segment(meta.qc.block_hash)?; + if let Some((meta, peer_info)) = self.db.last_sync_segment()? { + let request_hashes = self.db.get_sync_segment(meta.qc.block_hash)?; // Checksum of the request hashes let checksum = request_hashes @@ -501,35 +471,41 @@ impl Sync { tracing::info!( "sync::RequestMissingBlocks : requesting {} blocks of segment #{} from {}", request_hashes.len(), - self.count_segments()?, + self.db.count_sync_segments()?, peer_info.peer_id, ); - let message = match peer_info.version { + let (peer_info, message) = match peer_info.version { PeerVer::V2 => { - self.in_flight = Some(PeerInfo { - version: PeerVer::V2, - peer_id: peer_info.peer_id, - last_used: std::time::Instant::now(), - score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers - }); - ExternalMessage::MultiBlockRequest(request_hashes) + ( + PeerInfo { + version: PeerVer::V2, + peer_id: peer_info.peer_id, + last_used: std::time::Instant::now(), + score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers + }, + ExternalMessage::MultiBlockRequest(request_hashes), + ) } PeerVer::V1 => { - self.in_flight = Some(PeerInfo { - version: PeerVer::V1, - peer_id: peer_info.peer_id, - last_used: std::time::Instant::now(), - score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers - }); - // do not add VIEW_DRIFT - the stored marker is accurate! - ExternalMessage::BlockRequest(BlockRequest { - to_view: meta.view.saturating_sub(1), - from_view: meta.view.saturating_sub(self.max_batch_size as u64), - }) + ( + PeerInfo { + version: PeerVer::V1, + peer_id: peer_info.peer_id, + last_used: std::time::Instant::now(), + score: u32::MAX, // used to indicate faux peer, will not be added to the group of peers + }, + // do not add VIEW_DRIFT - the stored marker is accurate! 
+ ExternalMessage::BlockRequest(BlockRequest { + to_view: meta.view.saturating_sub(1), + from_view: meta.view.saturating_sub(self.max_batch_size as u64), + }), + ) } }; - self.message_sender + let request_id = self + .message_sender .send_external_message(peer_info.peer_id, message)?; + self.in_flight = Some((peer_info, request_id)); } } else { tracing::warn!( @@ -552,7 +528,7 @@ impl Sync { && response.from_view == u64::MAX { tracing::info!("sync::HandleBlockResponse : new response from {from}",); - if let Some(mut peer) = self.in_flight.take() { + if let Some((mut peer, _)) = self.in_flight.take() { if peer.peer_id == from && peer.version == PeerVer::V1 { // upgrade to V2 peer peer.version = PeerVer::V2; @@ -617,7 +593,11 @@ impl Sync { .proposals .into_iter() // filter any blocks that are not in the chain e.g. forks - .filter(|p| self.contains_metadata(&p.hash()).unwrap_or_default()) + .filter(|p| { + self.db + .contains_sync_metadata(&p.hash()) + .unwrap_or_default() + }) .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); @@ -644,7 +624,7 @@ impl Sync { response: Vec, ) -> Result<()> { // Check for expected response - let segment_peer = if let Some(peer) = self.in_flight.as_ref() { + let segment_peer = if let Some((peer, _)) = self.in_flight.as_ref() { if peer.peer_id != from { tracing::warn!( "sync::MetadataResponse : unexpected peer={} != {from}", @@ -704,15 +684,15 @@ impl Sync { tracing::info!( "sync::MetadataResponse : received {} metadata segment #{} from {}", segment.len(), - self.count_segments()?, + self.db.count_sync_segments()?, from ); // Record the constructed chain metadata - self.insert_metadata(&segment)?; + self.db.insert_sync_metadata(&segment)?; // Record landmark(s), including peer that has this set of blocks - self.push_segment(segment_peer, *meta)?; + self.db.push_sync_segment(segment_peer, *meta)?; // Record the oldest block in the chain's parent self.state = SyncState::Phase1(segment.last().cloned().unwrap()); @@ -747,8 +727,8 @@ impl Sync { from ); - // Do not respond to stale requests as the client has timed-out - if request.request_at.elapsed()? > self.request_timeout { + // Do not respond to stale requests as the client has probably timed-out + if request.request_at.elapsed()? > Duration::from_secs(5) { tracing::warn!("sync::MetadataRequest : stale request"); return Ok(ExternalMessage::Acknowledgement); } @@ -793,34 +773,24 @@ impl Sync { anyhow::bail!("sync::RequestMissingMetadata : invalid state"); } // Early exit if there's a request in-flight; and if it has not expired. 
- if let Some(peer) = self.in_flight.as_ref() { - if peer.last_used.elapsed() > self.request_timeout { - tracing::warn!( - "sync::RequestMissingMetadata : in-flight request {} timed out, requesting from new peer", - peer.peer_id - ); - self.peers - .done_with_peer(self.in_flight.take(), DownGrade::Timeout); - } else { - return Ok(()); - } - } else if self.in_pipeline > self.max_batch_size { + if self.in_flight.is_some() || self.in_pipeline > self.max_batch_size { // anything more than this and we cannot be sure whether the segment hits history tracing::warn!( - "sync::RequestMissingMetadata : syncing {} blocks in pipeline", - self.in_pipeline + "sync::RequestMissingMetadata : syncing {}/{} blocks in pipeline", + self.in_pipeline, + self.max_batch_size ); return Ok(()); } - if let Some(peer) = self.peers.get_next_peer() { + if let Some(peer_info) = self.peers.get_next_peer() { tracing::info!( "sync::RequestMissingMetadata : requesting {} metadata of segment #{} from {}", self.max_batch_size, - self.count_segments()? + 1, - peer.peer_id + self.db.count_sync_segments()? + 1, + peer_info.peer_id ); - let message = match (self.state.clone(), &peer.version) { + let message = match (self.state.clone(), &peer_info.version) { ( SyncState::Phase1(BlockHeader { number: block_number, @@ -869,9 +839,10 @@ impl Sync { } _ => anyhow::bail!("sync::MissingMetadata : invalid state"), }; - self.message_sender - .send_external_message(peer.peer_id, message)?; - self.in_flight = Some(peer); + let request_id = self + .message_sender + .send_external_message(peer_info.peer_id, message)?; + self.in_flight = Some((peer_info, request_id)); } else { tracing::warn!( "sync::RequestMissingBlocks : {} insufficient peers to handle request", @@ -946,7 +917,7 @@ impl Sync { Ok(self.in_pipeline != 0 || !matches!(self.state, SyncState::Phase0) || !self.recent_proposals.is_empty() - || self.count_segments()? != 0) + || self.db.count_sync_segments()? != 0) } // Returns (starting_block, current_block, highest_block) if we're syncing, @@ -1001,8 +972,8 @@ impl SyncPeers { /// /// This algorithm favours good peers that respond quickly (i.e. no timeout). /// In most cases, it eventually degenerates into 2 sources - avoid a single source of truth. - fn done_with_peer(&self, in_flight: Option, downgrade: DownGrade) { - if let Some(mut peer) = in_flight { + fn done_with_peer(&self, in_flight: Option<(PeerInfo, RequestId)>, downgrade: DownGrade) { + if let Some((mut peer, _)) = in_flight { tracing::trace!("sync::DoneWithPeer {} {:?}", peer.peer_id, downgrade); let mut peers = self.peers.lock().unwrap(); peer.score = peer.score.saturating_add(downgrade as u32); @@ -1069,12 +1040,15 @@ impl SyncPeers { } let mut peers = self.peers.lock().unwrap(); let mut peer = peer; - peer.last_used = peers - .peek() - .expect("peers.len() > 1") - .last_used - .checked_sub(Duration::from_secs(1)) - .expect("time is ordinal"); + if !peers.is_empty() { + // Ensure that it gets to the head of the line + peer.last_used = peers + .peek() + .expect("peers.len() > 1") + .last_used + .checked_sub(Duration::from_secs(1)) + .expect("time is ordinal"); + } peers.push(peer); Ok(()) } From 5bbfe26a05d0ae833121db8aa693b84e9e72a496 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 10:37:12 +0800 Subject: [PATCH 107/119] nit: change sync_data to sync_metadata table name; misc nits. 
--- zilliqa/src/db.rs | 32 ++++++++++++++++---------------- zilliqa/src/message.rs | 5 +---- zilliqa/src/sync.rs | 9 +++------ 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/zilliqa/src/db.rs b/zilliqa/src/db.rs index b0ca82f97..b4d1b8e77 100644 --- a/zilliqa/src/db.rs +++ b/zilliqa/src/db.rs @@ -329,7 +329,7 @@ impl Db { ", )?; connection.execute_batch( - "CREATE TEMP TABLE IF NOT EXISTS sync_data ( + "CREATE TEMP TABLE IF NOT EXISTS sync_metadata ( block_hash BLOB NOT NULL UNIQUE, parent_hash BLOB NOT NULL, block_number INTEGER NOT NULL PRIMARY KEY, @@ -338,7 +338,7 @@ impl Db { version INTEGER DEFAULT 0, peer BLOB DEFAULT NULL ); - CREATE INDEX IF NOT EXISTS idx_sync_data ON sync_data(block_number) WHERE peer IS NOT NULL;", + CREATE INDEX IF NOT EXISTS idx_sync_metadata ON sync_metadata(block_number) WHERE peer IS NOT NULL;", )?; Ok(()) @@ -361,20 +361,20 @@ impl Db { .db .lock() .unwrap() - .prepare_cached("SELECT COUNT(block_number) FROM sync_data WHERE peer IS NOT NULL")? + .prepare_cached("SELECT COUNT(block_number) FROM sync_metadata WHERE peer IS NOT NULL")? .query_row([], |row| row.get(0)) .optional()? .unwrap_or_default()) } /// Checks if the stored metadata exists - pub fn contains_sync_metadata(&self, hash: &Hash) -> Result { + pub fn contains_sync_metadata(&self, block_hash: &Hash) -> Result { Ok(self .db .lock() .unwrap() - .prepare_cached("SELECT block_number FROM sync_data WHERE block_hash = ?1")? - .query_row([hash], |row| row.get::<_, u64>(0)) + .prepare_cached("SELECT parent_hash FROM sync_metadata WHERE block_hash = ?1")? + .query_row([block_hash], |row| row.get::<_, Hash>(0)) .optional()? .is_some()) } @@ -387,7 +387,7 @@ impl Db { let mut block_hash = hash; while let Some(parent_hash) = db - .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .prepare_cached("SELECT parent_hash FROM sync_metadata WHERE block_hash = ?1")? .query_row([block_hash], |row| row.get::<_, Hash>(0)) .optional()? { @@ -400,7 +400,7 @@ impl Db { /// Peeks into the top of the segment stack. pub fn last_sync_segment(&self) -> Result> { let db = self.db.lock().unwrap(); - let r = db.prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, gas_used, version, peer FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + let r = db.prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, gas_used, version, peer FROM sync_metadata WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? .query_row([], |row| Ok(( BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?), PeerInfo { @@ -416,7 +416,7 @@ impl Db { pub fn push_sync_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> { let db = self.db.lock().unwrap(); db.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used, version, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used, :version, :peer)")? + "INSERT OR REPLACE INTO sync_metadata (parent_hash, block_hash, block_number, view_number, gas_used, version, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used, :version, :peer)")? .execute( named_params! { ":parent_hash": meta.qc.block_hash, @@ -438,7 +438,7 @@ impl Db { for meta in metas { tx.prepare_cached( - "INSERT OR REPLACE INTO sync_data (parent_hash, block_hash, block_number, view_number, gas_used) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used)")? 
+ "INSERT OR REPLACE INTO sync_metadata (parent_hash, block_hash, block_number, view_number, gas_used) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used)")? .execute( named_params! { ":parent_hash": meta.qc.block_hash, @@ -457,7 +457,7 @@ impl Db { self.db .lock() .unwrap() - .execute("DELETE FROM sync_data", [])?; + .execute("DELETE FROM sync_metadata", [])?; Ok(()) } @@ -466,14 +466,14 @@ impl Db { let mut db = self.db.lock().unwrap(); let c = db.transaction()?; - if let Some(block_hash) = c.prepare_cached("SELECT block_hash FROM sync_data WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? + if let Some(block_hash) = c.prepare_cached("SELECT block_hash FROM sync_metadata WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")? .query_row([], |row| row.get::<_,Hash>(0)).optional()? { - if let Some(parent_hash) = c.prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + if let Some(parent_hash) = c.prepare_cached("SELECT parent_hash FROM sync_metadata WHERE block_hash = ?1")? .query_row([block_hash], |row| row.get(0)).optional()? { // update marker c.prepare_cached( - "UPDATE sync_data SET peer = NULL WHERE block_hash = ?1")? + "UPDATE sync_metadata SET peer = NULL WHERE block_hash = ?1")? .execute( [block_hash] )?; @@ -482,7 +482,7 @@ impl Db { let mut hashes = Vec::new(); let mut block_hash = parent_hash; while let Some(parent_hash) = c - .prepare_cached("SELECT parent_hash FROM sync_data WHERE block_hash = ?1")? + .prepare_cached("SELECT parent_hash FROM sync_metadata WHERE block_hash = ?1")? .query_row([block_hash], |row| row.get::<_, Hash>(0)) .optional()? { @@ -491,7 +491,7 @@ impl Db { } for hash in hashes { - c.prepare_cached("DELETE FROM sync_data WHERE block_hash = ?1")? + c.prepare_cached("DELETE FROM sync_metadata WHERE block_hash = ?1")? .execute([hash])?; } } diff --git a/zilliqa/src/message.rs b/zilliqa/src/message.rs index 8f4a9b28c..9e1088d57 100644 --- a/zilliqa/src/message.rs +++ b/zilliqa/src/message.rs @@ -274,8 +274,7 @@ pub enum ExternalMessage { /// An acknowledgement of the receipt of a message. Note this is only used as a response when the caller doesn't /// require any data in the response. Acknowledgement, - AddPeer, - RemovePeer, + /// The following are used for the new sync protocol InjectedProposal(InjectedProposal), MetaDataRequest(RequestBlocksByHeight), MetaDataResponse(Vec), @@ -315,8 +314,6 @@ impl Display for ExternalMessage { ExternalMessage::InjectedProposal(p) => { write!(f, "InjectedProposal {}", p.block.number()) } - ExternalMessage::AddPeer => write!(f, "AddPeer"), - ExternalMessage::RemovePeer => write!(f, "RemovePeer"), ExternalMessage::Proposal(p) => write!(f, "Proposal({})", p.view()), ExternalMessage::Vote(v) => write!(f, "Vote({})", v.view), ExternalMessage::NewView(n) => write!(f, "NewView({})", n.view), diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index b5b829063..fe6e59c33 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -245,10 +245,7 @@ impl Sync { } } _ => { - tracing::debug!( - "sync::SyncProposal : syncing {} blocks in pipeline", - self.in_pipeline - ); + tracing::debug!("sync::SyncProposal : syncing {} blocks", self.in_pipeline); } } @@ -442,7 +439,7 @@ impl Sync { // Early exit if there's a request in-flight; and if it has not expired. 
if self.in_flight.is_some() || self.in_pipeline > self.max_blocks_in_flight { tracing::warn!( - "sync::RequestMissingBlocks : syncing {}/{} blocks in pipeline", + "sync::RequestMissingBlocks : syncing {}/{} blocks", self.in_pipeline, self.max_blocks_in_flight ); @@ -776,7 +773,7 @@ impl Sync { if self.in_flight.is_some() || self.in_pipeline > self.max_batch_size { // anything more than this and we cannot be sure whether the segment hits history tracing::warn!( - "sync::RequestMissingMetadata : syncing {}/{} blocks in pipeline", + "sync::RequestMissingMetadata : syncing {}/{} blocks", self.in_pipeline, self.max_batch_size ); From 9b28c959236d51bf56ded9e1b4ce87fede6210c1 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 11:01:37 +0800 Subject: [PATCH 108/119] feat: added place-holder for active/passive sync. --- zilliqa/src/consensus.rs | 4 ++-- zilliqa/src/node.rs | 2 +- zilliqa/src/sync.rs | 24 ++++++++++++++++++++---- zilliqa/tests/it/staking.rs | 2 +- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs index 9487ae98b..9b80e9510 100644 --- a/zilliqa/src/consensus.rs +++ b/zilliqa/src/consensus.rs @@ -3130,8 +3130,8 @@ impl Consensus { trace!("request_missing_blocks from timer"); // TODO: Drive passive-sync from Timeouts - if self.sync.am_syncing()? { - self.sync.sync_internal()?; + if !self.sync.am_syncing()? { + self.sync.sync_to_genesis()?; } else { trace!("not syncing ..."); } diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs index c1dccc01f..5e50e14b5 100644 --- a/zilliqa/src/node.rs +++ b/zilliqa/src/node.rs @@ -909,7 +909,7 @@ impl Node { self.message_sender.broadcast_proposal(message)?; } } else { - self.consensus.sync.sync_proposal(proposal)?; // proposal is already verified + self.consensus.sync.sync_from_proposal(proposal)?; // proposal is already verified } Ok(()) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index fe6e59c33..f284e26e7 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -133,16 +133,27 @@ impl Sync { }) } + /// P2P Failure + /// + /// This gets called for any libp2p request failure. pub fn handle_request_failure(&mut self, failure: OutgoingMessageFailure) -> Result<()> { // chekc if the request is a sync messages if let Some((peer, req_id)) = self.in_flight.as_ref() { // downgrade peer due to timeout if peer.peer_id == failure.peer && *req_id == failure.request_id { tracing::warn!(to = %peer.peer_id, err = %failure.error, - "sync::RequestFailure : in-flight request failed" + "sync::RequestFailure : in-flight failed" ); self.peers .done_with_peer(self.in_flight.take(), DownGrade::Timeout); + // Retry if failed in Phase 2 for whatever reason + match self.state { + SyncState::Phase1(_) if Self::DO_SPECULATIVE => { + self.request_missing_metadata(None)? + } + SyncState::Phase2(_) => self.state = SyncState::Retry1, + _ => {} + } } } Ok(()) @@ -156,17 +167,22 @@ impl Sync { /// If we find its parent in history, we inject the entire queue. Otherwise, we start syncing. /// /// We do not perform checks on the Proposal here. This is done in the consensus layer. - pub fn sync_proposal(&mut self, proposal: Proposal) -> Result<()> { + pub fn sync_from_proposal(&mut self, proposal: Proposal) -> Result<()> { // just stuff the latest proposal into the fixed-size queue. 
while self.recent_proposals.len() >= self.max_batch_size { self.recent_proposals.pop_front(); } self.recent_proposals.push_back(proposal); - self.sync_internal() + self.internal_sync() + } + + // TODO: Passive-sync place-holder + pub fn sync_to_genesis(&mut self) -> Result<()> { + Ok(()) } - pub fn sync_internal(&mut self) -> Result<()> { + fn internal_sync(&mut self) -> Result<()> { if self.recent_proposals.is_empty() { // Do nothing if there's no recent proposals. tracing::debug!("sync::Internal : missing recent proposals"); diff --git a/zilliqa/tests/it/staking.rs b/zilliqa/tests/it/staking.rs index b423ffcde..966662f21 100644 --- a/zilliqa/tests/it/staking.rs +++ b/zilliqa/tests/it/staking.rs @@ -532,7 +532,7 @@ async fn validators_can_join_and_become_proposer(mut network: Network) { // Give new node time to catch up to block including deposit_v3 deployment network - .run_until_block(&staker_wallet, 24.into(), 200) + .run_until_block(&staker_wallet, 24.into(), 424) .await; let deposit_hash = deposit_v3_stake( From c5d9b92c0e00b982e6e8dfa0923b03ad4a29d3b6 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 11:44:11 +0800 Subject: [PATCH 109/119] fix #2227; and remove txn.verify() during Phase 2 - it is checked during Injection. --- zilliqa/src/sync.rs | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index f284e26e7..a4083a066 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -321,23 +321,6 @@ impl Sync { &mut self, from: PeerId, response: Vec, - ) -> Result<()> { - // Verify transactions on the client-side - let proposals = response - .into_iter() - .map(|p| { - let (b, t) = p.into_parts(); - let txns = t.into_iter().map(|t| t.verify().unwrap()).collect_vec(); - Proposal::from_parts(b, txns) - }) - .collect_vec(); - self.inner_handle_multiblock_response(from, proposals) - } - - pub fn inner_handle_multiblock_response( - &mut self, - from: PeerId, - response: Vec, ) -> Result<()> { if let Some((peer, _)) = self.in_flight.as_ref() { if peer.peer_id != from { @@ -399,7 +382,7 @@ impl Sync { .collect_vec(); self.db.pop_sync_segment()?; - self.inject_proposals(proposals)?; + self.inject_proposals(proposals)?; // txns are verified when processing InjectedProposal. // Done with phase 2 if self.db.count_sync_segments()? == 0 { @@ -614,7 +597,7 @@ impl Sync { .sorted_by(|a, b| b.number().cmp(&a.number())) .collect_vec(); - self.inner_handle_multiblock_response(from, multi_blocks)?; + self.handle_multiblock_response(from, multi_blocks)?; } _ => { tracing::error!( @@ -929,7 +912,6 @@ impl Sync { pub fn am_syncing(&self) -> Result { Ok(self.in_pipeline != 0 || !matches!(self.state, SyncState::Phase0) - || !self.recent_proposals.is_empty() || self.db.count_sync_segments()? != 0) } From dae52d57f1655618ba79a0d3b84c3bcbdf0ce97b Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 12:24:12 +0800 Subject: [PATCH 110/119] feat: place-holder to store old ZIL txn blocks. --- zilliqa/src/sync.rs | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index a4083a066..e0545304b 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -22,7 +22,7 @@ use crate::{ }, node::{MessageSender, OutgoingMessageFailure, RequestId}, time::SystemTime, - transaction::EvmGas, + transaction::{EvmGas, SignedTransaction}, }; // Syncing Algorithm @@ -874,19 +874,27 @@ impl Sync { // Just pump the Proposals back to ourselves. 
for p in proposals { - tracing::trace!( - "sync::InjectProposals : injecting number: {} hash: {}", - p.number(), - p.hash(), - ); - - self.message_sender.send_external_message( - self.peer_id, - ExternalMessage::InjectedProposal(InjectedProposal { - from: self.peer_id, - block: p, - }), - )?; + if !p + .transactions + .iter() + .any(|t| matches!(t, SignedTransaction::Zilliqa { .. })) + { + tracing::trace!( + number = %p.number(), hash = %p.hash(), + "sync::InjectProposals : applying", + ); + self.message_sender.send_external_message( + self.peer_id, + ExternalMessage::InjectedProposal(InjectedProposal { + from: self.peer_id, + block: p, + }), + )?; + } else { + tracing::warn!(number = %p.number(), hash = %p.hash(), "sync::InjectProposals : storing"); + // TODO: just store old ZIL blocks + todo!("store ZIL block"); + } } self.inject_at = Some((std::time::Instant::now(), self.in_pipeline)); From 0c3614a09ca2e7d7580c9daacd5e34448673a273 Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 12:30:14 +0800 Subject: [PATCH 111/119] nit: simplify run_until_synced(); --- zilliqa/src/sync.rs | 22 ++++++++++++---------- zilliqa/tests/it/main.rs | 9 ++------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index e0545304b..47a22262e 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -177,7 +177,7 @@ impl Sync { self.internal_sync() } - // TODO: Passive-sync place-holder + // TODO: Passive-sync place-holder - https://github.com/Zilliqa/zq2/issues/2232 pub fn sync_to_genesis(&mut self) -> Result<()> { Ok(()) } @@ -764,6 +764,8 @@ impl Sync { /// This constructs a chain history by requesting blocks from a peer, going backwards from a given block. /// If Phase 1 is in progress, it continues requesting blocks from the last known Phase 1 block. /// Otherwise, it requests blocks from the given starting metadata. 
+ /// + /// TODO: speed it up - https://github.com/Zilliqa/zq2/issues/2158 pub fn request_missing_metadata(&mut self, meta: Option) -> Result<()> { if !matches!(self.state, SyncState::Phase1(_)) && !matches!(self.state, SyncState::Phase0) { anyhow::bail!("sync::RequestMissingMetadata : invalid state"); @@ -883,18 +885,18 @@ impl Sync { number = %p.number(), hash = %p.hash(), "sync::InjectProposals : applying", ); - self.message_sender.send_external_message( - self.peer_id, - ExternalMessage::InjectedProposal(InjectedProposal { - from: self.peer_id, - block: p, - }), - )?; } else { tracing::warn!(number = %p.number(), hash = %p.hash(), "sync::InjectProposals : storing"); - // TODO: just store old ZIL blocks - todo!("store ZIL block"); + // TODO: just store old ZIL blocks - https://github.com/Zilliqa/zq2/issues/2232 } + + self.message_sender.send_external_message( + self.peer_id, + ExternalMessage::InjectedProposal(InjectedProposal { + from: self.peer_id, + block: p, + }), + )?; } self.inject_at = Some((std::time::Instant::now(), self.in_pipeline)); diff --git a/zilliqa/tests/it/main.rs b/zilliqa/tests/it/main.rs index c90064ef7..47bd10434 100644 --- a/zilliqa/tests/it/main.rs +++ b/zilliqa/tests/it/main.rs @@ -1061,17 +1061,12 @@ impl Network { break i; } }; - let mut debounce = 0; - let mut old_height = 0; self.run_until( |net| { + let syncing = net.get_node(index).consensus.sync.am_syncing().unwrap(); let height_i = net.get_node(index).get_finalized_height().unwrap(); let height_c = net.get_node(check).get_finalized_height().unwrap(); - if height_c == height_i && height_i > old_height { - debounce += 1; - old_height = height_i; - } - debounce == 3 + height_c == height_i && height_i > 0 && !syncing }, 2000, ) From d0d3f00203c6b885fec79bdd0b3f58b081579e3d Mon Sep 17 00:00:00 2001 From: Shawn Date: Tue, 28 Jan 2025 15:27:40 +0800 Subject: [PATCH 112/119] fix: flaw in get_next_peer(). --- zilliqa/src/sync.rs | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs index 47a22262e..a72cd3f5d 100644 --- a/zilliqa/src/sync.rs +++ b/zilliqa/src/sync.rs @@ -504,10 +504,7 @@ impl Sync { self.in_flight = Some((peer_info, request_id)); } } else { - tracing::warn!( - "sync::RequestMissingBlocks : {} insufficient peers to handle request", - self.peers.len() - ); + tracing::warn!("sync::RequestMissingBlocks : insufficient peers to handle request"); } Ok(()) } @@ -842,10 +839,7 @@ impl Sync { .send_external_message(peer_info.peer_id, message)?; self.in_flight = Some((peer_info, request_id)); } else { - tracing::warn!( - "sync::RequestMissingBlocks : {} insufficient peers to handle request", - self.peers.len() - ); + tracing::warn!("sync::RequestMissingBlocks : insufficient peers to handle request",); } Ok(()) } @@ -996,11 +990,10 @@ impl SyncPeers { /// Add bulk peers pub fn add_peers(&self, peers: Vec) { tracing::debug!("sync::AddPeers {:?}", peers); - for peer in peers { - if peer != self.peer_id { - self.add_peer(peer); - } - } + peers + .into_iter() + .filter(|p| *p != self.peer_id) + .for_each(|p| self.add_peer(p)); } /// Add a peer to the list of peers. 
@@ -1014,7 +1007,7 @@ impl SyncPeers {
             peer_id: peer,
             last_used: Instant::now(),
         };
-        // ensure that it is unique - avoids single source of truth
+        // ensure that it is unique
         peers.retain(|p: &PeerInfo| p.peer_id != peer);
         peers.push(new_peer);
 
@@ -1029,17 +1022,17 @@ impl SyncPeers {
     }
 
     /// Get the next best peer to use
-    pub fn get_next_peer(&self) -> Option {
-        let mut peer = self.peers.lock().unwrap().pop()?;
-        peer.last_used = std::time::Instant::now();
-        // dynamic sizing should not be needed, if we're syncing recent blocks.
-        // self.max_batch_size = self.dynamic_batch_sizing(&peer);
-        tracing::trace!("sync::GetNextPeer {} ({})", peer.peer_id, peer.score);
-        Some(peer)
+    fn get_next_peer(&self) -> Option {
+        if let Some(mut peer) = self.peers.lock().unwrap().pop() {
+            peer.last_used = std::time::Instant::now();
+            tracing::trace!(peer = % peer.peer_id, score= %peer.score, "sync::GetNextPeer");
+            return Some(peer);
+        }
+        None
     }
 
     /// Reinserts the peer such that it is at the front of the queue.
-    pub fn reinsert_peer(&self, peer: PeerInfo) -> Result<()> {
+    fn reinsert_peer(&self, peer: PeerInfo) -> Result<()> {
         if peer.score == u32::MAX {
             return Ok(());
         }
@@ -1057,14 +1050,6 @@ impl SyncPeers {
         peers.push(peer);
         Ok(())
     }
-
-    pub fn len(&self) -> usize {
-        self.peers.lock().unwrap().len()
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.peers.lock().unwrap().is_empty()
-    }
 }
 
 #[derive(Debug, Clone, Eq, PartialEq)]

From dba0235a927e34ed1d3b8462890e08bb4653ee11 Mon Sep 17 00:00:00 2001
From: Shawn
Date: Tue, 28 Jan 2025 15:37:54 +0800
Subject: [PATCH 113/119] feat: early prototype for issue #1878.

---
 zilliqa/src/node.rs |  4 +++-
 zilliqa/src/sync.rs | 42 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/zilliqa/src/node.rs b/zilliqa/src/node.rs
index 5e50e14b5..f02d7fa46 100644
--- a/zilliqa/src/node.rs
+++ b/zilliqa/src/node.rs
@@ -350,7 +350,9 @@ impl Node {
             ExternalMessage::BlockResponse(response) => {
                 self.consensus.sync.handle_block_response(from, response)?
             }
-            ExternalMessage::Acknowledgement => {}
+            ExternalMessage::Acknowledgement => {
+                self.consensus.sync.handle_acknowledgement(from)?;
+            }
             msg => {
                 warn!(%msg, "unexpected message type");
             }
diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs
index a72cd3f5d..19bca2ede 100644
--- a/zilliqa/src/sync.rs
+++ b/zilliqa/src/sync.rs
@@ -133,13 +133,42 @@ impl Sync {
         })
     }
 
+    /// Skip Failure
+    ///
+    /// We get a plain ACK in certain cases - treated as an empty response.
+    pub fn handle_acknowledgement(&mut self, from: PeerId) -> Result<()> {
+        if let Some((peer, _)) = self.in_flight.as_ref() {
+            // downgrade peer due to empty response
+            if peer.peer_id == from {
+                tracing::warn!(to = %peer.peer_id,
+                    "sync::Acknowledgement : empty response"
+                );
+                self.peers
+                    .done_with_peer(self.in_flight.take(), DownGrade::Empty);
+                // Retry if failed in Phase 2 for whatever reason
+                match self.state {
+                    SyncState::Phase1(_) if Self::DO_SPECULATIVE => {
+                        self.request_missing_metadata(None)?
+                    }
+                    SyncState::Phase2(_) => self.state = SyncState::Retry1,
+                    _ => {}
+                }
+            } else {
+                tracing::warn!(to = %peer.peer_id,
+                    "sync::Acknowledgement : spurious"
+                );
+            }
+        }
+        Ok(())
+    }
+
     /// P2P Failure
     ///
-    /// This gets called for any libp2p request failure.
+    /// This gets called for any libp2p request failure - treated as a network failure
     pub fn handle_request_failure(&mut self, failure: OutgoingMessageFailure) -> Result<()> {
-        // chekc if the request is a sync messages
+        // check if the request is a sync messages
         if let Some((peer, req_id)) = self.in_flight.as_ref() {
-            // downgrade peer due to timeout
+            // downgrade peer due to network failure
             if peer.peer_id == failure.peer && *req_id == failure.request_id {
                 tracing::warn!(to = %peer.peer_id, err = %failure.error,
                     "sync::RequestFailure : in-flight failed"
@@ -154,6 +183,10 @@ impl Sync {
                     SyncState::Phase2(_) => self.state = SyncState::Retry1,
                     _ => {}
                 }
+            } else {
+                tracing::warn!(to = %peer.peer_id,
+                    "sync::RequestFailure : spurious"
+                );
             }
         }
         Ok(())
@@ -726,8 +759,7 @@ impl Sync {
             return Ok(ExternalMessage::Acknowledgement);
         }
 
-        // TODO: Check if we should service this request
-        // Validators could respond to this request if there is nothing else to do.
+        // TODO: Check if we should service this request - https://github.com/Zilliqa/zq2/issues/1878
 
         let batch_size: usize = self
             .max_batch_size

From 5dce6ae16fde4b3c0bae05c50bf27fe4c13d994c Mon Sep 17 00:00:00 2001
From: James Hinshelwood
Date: Wed, 29 Jan 2025 12:06:14 +0000
Subject: [PATCH 114/119] Delete all non-finalized blocks from database at
 startup

Previously we only deleted 'canonical' blocks.
---
 zilliqa/src/consensus.rs | 12 +++++-------
 zilliqa/src/db.rs        | 25 ++++++++++---------------
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/zilliqa/src/consensus.rs b/zilliqa/src/consensus.rs
index 9b80e9510..9718ad13e 100644
--- a/zilliqa/src/consensus.rs
+++ b/zilliqa/src/consensus.rs
@@ -277,21 +277,19 @@ impl Consensus {
             // If we have newer blocks, erase them
             // @todo .. more elegantly :-)
             loop {
-                let highest_block_number = db
-                    .get_highest_canonical_block_number()?
-                    .ok_or_else(|| anyhow!("can't find highest block num in database!"))?;
                 let head_block = db
-                    .get_canonical_block_by_number(highest_block_number)?
-                    .ok_or_else(|| anyhow!("missing head block!"))?;
+                    .get_highest_recorded_block()?
+                    .ok_or_else(|| anyhow!("can't find highest block in database!"))?;
 
                 trace!(
-                    "recovery: highest_block_number {highest_block_number} view {0}",
+                    "recovery: highest_block_number {} view {}",
+                    head_block.number(),
                     head_block.view()
                 );
 
                 if head_block.view() > high_block.view() && head_block.view() > finalized_number {
-                    trace!("recovery: stored block {0} reverted", highest_block_number);
+                    trace!("recovery: stored block {0} reverted", head_block.number());
                     db.remove_transactions_executed_in_block(&head_block.hash())?;
                     db.remove_block(&head_block)?;
                 } else {
diff --git a/zilliqa/src/db.rs b/zilliqa/src/db.rs
index b4d1b8e77..3bb0a30eb 100644
--- a/zilliqa/src/db.rs
+++ b/zilliqa/src/db.rs
@@ -177,6 +177,7 @@ enum BlockFilter {
     Hash(Hash),
     View(u64),
     Height(u64),
+    MaxHeight,
 }
 
 const CHECKPOINT_HEADER_BYTES: [u8; 8] = *b"ZILCHKPT";
@@ -781,19 +782,6 @@ impl Db {
             .unwrap_or(None))
     }
 
-    // Deliberately not named get_highest_block_number() because there used to be one
-    // of those with unclear semantics, so changing name to force the compiler to error
-    // if it was used.
-    pub fn get_highest_recorded_block_number(&self) -> Result> {
-        Ok(self
-            .db
-            .lock()
-            .unwrap()
-            .prepare_cached("SELECT height FROM blocks ORDER BY height DESC LIMIT 1")?
-            .query_row((), |row| row.get(0))
-            .optional()?)
-    }
-
     pub fn get_highest_canonical_block_number(&self) -> Result> {
         Ok(self
             .db
@@ -1028,8 +1016,8 @@ impl Db {
             })
         }
         macro_rules! query_block {
-            ($cond: tt, $key: tt) => {
-                self.db.lock().unwrap().prepare_cached(concat!("SELECT block_hash, view, height, qc, signature, state_root_hash, transactions_root_hash, receipts_root_hash, timestamp, gas_used, gas_limit, agg FROM blocks WHERE ", $cond),)?.query_row([$key], make_block).optional()?
+            ($cond: tt $(, $key:tt)*) => {
+                self.db.lock().unwrap().prepare_cached(concat!("SELECT block_hash, view, height, qc, signature, state_root_hash, transactions_root_hash, receipts_root_hash, timestamp, gas_used, gas_limit, agg FROM blocks WHERE ", $cond),)?.query_row([$($key),*], make_block).optional()?
             };
         }
         Ok(match filter {
@@ -1042,6 +1030,9 @@ impl Db {
             BlockFilter::Height(height) => {
                 query_block!("height = ?1 AND is_canonical = TRUE", height)
             }
+            BlockFilter::MaxHeight => {
+                query_block!("TRUE ORDER BY height DESC LIMIT 1")
+            }
         })
     }
 
@@ -1072,6 +1063,10 @@ impl Db {
         self.get_block(BlockFilter::Height(number))
     }
 
+    pub fn get_highest_recorded_block(&self) -> Result> {
+        self.get_block(BlockFilter::MaxHeight)
+    }
+
     pub fn contains_block(&self, block_hash: &Hash) -> Result {
         Ok(self
             .db

From 516fc63b6d7f2e20bc8d0ad0f7dbade1a998fd62 Mon Sep 17 00:00:00 2001
From: James Hinshelwood
Date: Wed, 29 Jan 2025 17:29:43 +0000
Subject: [PATCH 115/119] Don't fail benchmark workflows on alert

---
 .github/workflows/base_benchmarks.yaml | 1 -
 .github/workflows/pr_benchmarks.yaml   | 1 -
 2 files changed, 2 deletions(-)

diff --git a/.github/workflows/base_benchmarks.yaml b/.github/workflows/base_benchmarks.yaml
index 69c20fff5..5d0912ae1 100644
--- a/.github/workflows/base_benchmarks.yaml
+++ b/.github/workflows/base_benchmarks.yaml
@@ -33,7 +33,6 @@ jobs:
             --threshold-max-sample-size 64 \
             --threshold-upper-boundary 0.99 \
             --thresholds-reset \
-            --err \
             --adapter rust_criterion \
             --github-actions '${{ secrets.GITHUB_TOKEN }}' \
             cargo bench
diff --git a/.github/workflows/pr_benchmarks.yaml b/.github/workflows/pr_benchmarks.yaml
index 305e99d16..de51b72e5 100644
--- a/.github/workflows/pr_benchmarks.yaml
+++ b/.github/workflows/pr_benchmarks.yaml
@@ -34,7 +34,6 @@ jobs:
             --start-point-clone-thresholds \
             --start-point-reset \
             --testbed self-hosted \
-            --err \
             --adapter rust_criterion \
             --github-actions '${{ secrets.GITHUB_TOKEN }}' \
             cargo bench

From 770465f5540e2229ccd03f78f72d8b8b73ebc41c Mon Sep 17 00:00:00 2001
From: James Hinshelwood
Date: Wed, 29 Jan 2025 17:45:28 +0000
Subject: [PATCH 116/119] Remove redundant config

---
 zilliqa/src/p2p_node.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs
index 21737468e..24cc271ad 100644
--- a/zilliqa/src/p2p_node.rs
+++ b/zilliqa/src/p2p_node.rs
@@ -112,8 +112,7 @@ impl P2pNode {
             Ok(Behaviour {
                 request_response: request_response::cbor::Behaviour::new(
                     iter::once((StreamProtocol::new("/zq2-message/1"), ProtocolSupport::Full)),
-                    request_response::Config::default()
-                        .with_request_timeout(Duration::from_secs(10)),
+                    Default::default(),
                 ),
                 gossipsub: gossipsub::Behaviour::new(
                     MessageAuthenticity::Signed(key_pair.clone()),

From eb42fbb4d45f45f42cc4c0af703bacf8e8b19ff4 Mon Sep 17 00:00:00 2001
From: James Hinshelwood
Date: Thu, 30 Jan 2025 17:07:57 +0000
Subject: [PATCH 117/119] Hide listen addrs

---
 zilliqa/src/p2p_node.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/zilliqa/src/p2p_node.rs b/zilliqa/src/p2p_node.rs
index 24cc271ad..05a130181 100644
--- a/zilliqa/src/p2p_node.rs
+++ b/zilliqa/src/p2p_node.rs
@@ -135,8 +135,7 @@ impl P2pNode {
                 // So, the nodes are unable to see each other directly and remain isolated, defeating kademlia and autonat.
                 identify: identify::Behaviour::new(
                     identify::Config::new("zilliqa/1.0.0".into(), key_pair.public())
-                        .with_hide_listen_addrs(false)
-                        .with_push_listen_addr_updates(true),
+                        .with_hide_listen_addrs(true),
                 ),
             })
         })?

From b10602e9c99b6ef80c9c1e2a77b9f5ecba735b2d Mon Sep 17 00:00:00 2001
From: Shawn
Date: Fri, 31 Jan 2025 13:06:07 +0800
Subject: [PATCH 118/119] feat: store raw header blob in sync_metadata().

---
 zilliqa/src/db.rs   | 23 ++++++++++-------------
 zilliqa/src/sync.rs | 46 ++++++++------------------------------------
 2 files changed, 18 insertions(+), 51 deletions(-)

diff --git a/zilliqa/src/db.rs b/zilliqa/src/db.rs
index b52d4ece2..f8fa197da 100644
--- a/zilliqa/src/db.rs
+++ b/zilliqa/src/db.rs
@@ -341,10 +341,9 @@ impl Db {
                 block_hash BLOB NOT NULL UNIQUE,
                 parent_hash BLOB NOT NULL,
                 block_number INTEGER NOT NULL PRIMARY KEY,
-                view_number INTEGER NOT NULL,
-                gas_used INTEGER NOT NULL,
                 version INTEGER DEFAULT 0,
-                peer BLOB DEFAULT NULL
+                peer BLOB DEFAULT NULL,
+                rawdata BLOB NOT NULL
             );
             CREATE INDEX IF NOT EXISTS idx_sync_metadata ON sync_metadata(block_number) WHERE peer IS NOT NULL;",
         )?;
@@ -408,14 +407,14 @@ impl Db {
     /// Peeks into the top of the segment stack.
     pub fn last_sync_segment(&self) -> Result> {
         let db = self.db.lock().unwrap();
-        let r = db.prepare_cached("SELECT parent_hash, block_hash, block_number, view_number, gas_used, version, peer FROM sync_metadata WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")?
+        let r = db.prepare_cached("SELECT rawdata, version, peer FROM sync_metadata WHERE peer IS NOT NULL ORDER BY block_number ASC LIMIT 1")?
            .query_row([], |row| Ok((
-                BlockHeader::from_meta_data(row.get(0)?,row.get(1)?, row.get(2)?, row.get(3)?, row.get(4)?),
+                serde_json::from_slice(row.get::<_,Vec>(0)?.as_slice()).unwrap(),
                PeerInfo {
                    last_used: Instant::now(),
                    score: u32::MAX,
-                    version: row.get(5)?,
-                    peer_id: PeerId::from_bytes(row.get::<_,Vec>(6)?.as_slice()).unwrap(),
+                    version: row.get(1)?,
+                    peer_id: PeerId::from_bytes(row.get::<_,Vec>(2)?.as_slice()).unwrap(),
                }))).optional()?;
        Ok(r)
    }
@@ -424,16 +423,15 @@ impl Db {
     pub fn push_sync_segment(&self, peer: PeerInfo, meta: BlockHeader) -> Result<()> {
         let db = self.db.lock().unwrap();
         db.prepare_cached(
-            "INSERT OR REPLACE INTO sync_metadata (parent_hash, block_hash, block_number, view_number, gas_used, version, peer) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used, :version, :peer)")?
+            "INSERT OR REPLACE INTO sync_metadata (parent_hash, block_hash, block_number, version, peer, rawdata) VALUES (:parent_hash, :block_hash, :block_number, :version, :peer, :rawdata)")?
             .execute(
                 named_params! {
                     ":parent_hash": meta.qc.block_hash,
                     ":block_hash": meta.hash,
                     ":block_number": meta.number,
-                    ":view_number": meta.view,
-                    ":gas_used": meta.gas_used,
                     ":peer": peer.peer_id.to_bytes(),
                     ":version": peer.version,
+                    ":rawdata": serde_json::to_vec(&meta).unwrap(),
                 },
             )?;
         Ok(())
     }
@@ -446,14 +444,13 @@ impl Db {
 
         for meta in metas {
             tx.prepare_cached(
-                "INSERT OR REPLACE INTO sync_metadata (parent_hash, block_hash, block_number, view_number, gas_used) VALUES (:parent_hash, :block_hash, :block_number, :view_number, :gas_used)")?
+                "INSERT OR REPLACE INTO sync_metadata (parent_hash, block_hash, block_number, rawdata) VALUES (:parent_hash, :block_hash, :block_number, :rawdata)")?
                 .execute(
                     named_params! {
                         ":parent_hash": meta.qc.block_hash,
                         ":block_hash": meta.hash,
                         ":block_number": meta.number,
-                        ":view_number": meta.view,
-                        ":gas_used": meta.gas_used,
+                        ":rawdata": serde_json::to_vec(meta).unwrap(),
                    })?;
         }
         tx.commit()?;
diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs
index 19bca2ede..b62746cd9 100644
--- a/zilliqa/src/sync.rs
+++ b/zilliqa/src/sync.rs
@@ -22,7 +22,7 @@ use crate::{
     },
     node::{MessageSender, OutgoingMessageFailure, RequestId},
     time::SystemTime,
-    transaction::{EvmGas, SignedTransaction},
+    transaction::SignedTransaction,
 };
 
 // Syncing Algorithm
@@ -229,17 +229,7 @@ impl Sync {
         if !self.db.contains_block(&parent_hash)? {
             // No parent block, trigger sync
             tracing::warn!("sync::SyncProposal : syncing from {parent_hash}",);
-            let block_hash = self.recent_proposals.back().unwrap().hash();
-            let block_number = self.recent_proposals.back().unwrap().number();
-            let view_number = self.recent_proposals.back().unwrap().view();
-            let gas_used = self.recent_proposals.back().unwrap().header.gas_used;
-            let meta = BlockHeader::from_meta_data(
-                parent_hash,
-                block_hash,
-                block_number,
-                view_number,
-                gas_used,
-            );
+            let meta = self.recent_proposals.back().unwrap().header;
             self.request_missing_metadata(Some(meta))?;
 
             let highest_block = self
@@ -707,6 +697,12 @@ impl Sync {
         // Chain segment is sane
         let segment = response;
 
+        // Record the constructed chain metadata
+        self.db.insert_sync_metadata(&segment)?;
+
+        // Record landmark(s), including peer that has this set of blocks
+        self.db.push_sync_segment(segment_peer, *meta)?;
+
         tracing::info!(
             "sync::MetadataResponse : received {} metadata segment #{} from {}",
             segment.len(),
@@ -714,12 +710,6 @@ impl Sync {
             from
         );
 
-        // Record the constructed chain metadata
-        self.db.insert_sync_metadata(&segment)?;
-
-        // Record landmark(s), including peer that has this set of blocks
-        self.db.push_sync_segment(segment_peer, *meta)?;
-
         // Record the oldest block in the chain's parent
         self.state = SyncState::Phase1(segment.last().cloned().unwrap());
         let last_block_hash = segment.last().as_ref().unwrap().hash;
@@ -1161,23 +1151,3 @@ impl ToSql for PeerVer {
         Ok((self.clone() as u32).into())
     }
 }
-
-impl BlockHeader {
-    pub fn from_meta_data(
-        parent_hash: Hash,
-        block_hash: Hash,
-        block_number: u64,
-        view_number: u64,
-        gas_used: EvmGas,
-    ) -> BlockHeader {
-        let mut meta = BlockHeader {
-            gas_used,
-            view: view_number,
-            number: block_number,
-            hash: block_hash,
-            ..Default::default()
-        };
-        meta.qc.block_hash = parent_hash;
-        meta
-    }
-}

From 071d40bef688e0ba02d12a4c05ce14d51716e43e Mon Sep 17 00:00:00 2001
From: Shawn
Date: Mon, 3 Feb 2025 14:43:29 +0800
Subject: [PATCH 119/119] feat: minor log changes; remove redundant check in
 handle_metadata_response().
---
 zilliqa/src/sync.rs | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/zilliqa/src/sync.rs b/zilliqa/src/sync.rs
index b62746cd9..0d761a883 100644
--- a/zilliqa/src/sync.rs
+++ b/zilliqa/src/sync.rs
@@ -111,10 +111,9 @@ impl Sync {
             SyncState::Retry1 // continue sync
         };
 
-        let latest_block_number = latest_block
+        let (latest_block_number, latest_block_hash) = latest_block
             .as_ref()
-            .expect("Some(block) expected")
-            .number();
+            .map_or_else(|| (u64::MIN, Hash::ZERO), |b| (b.number(), b.hash()));
 
         Ok(Self {
             db,
@@ -129,7 +128,7 @@ impl Sync {
             recent_proposals: VecDeque::with_capacity(max_batch_size),
             inject_at: None,
             started_at_block_number: latest_block_number,
-            checkpoint_hash: Hash::ZERO,
+            checkpoint_hash: latest_block_hash,
         })
     }
 
@@ -171,7 +170,7 @@ impl Sync {
             // downgrade peer due to network failure
             if peer.peer_id == failure.peer && *req_id == failure.request_id {
                 tracing::warn!(to = %peer.peer_id, err = %failure.error,
-                    "sync::RequestFailure : in-flight failed"
+                    "sync::RequestFailure : network error"
                 );
                 self.peers
                     .done_with_peer(self.in_flight.take(), DownGrade::Timeout);
@@ -228,7 +227,7 @@ impl Sync {
         let parent_hash = self.recent_proposals.back().unwrap().header.qc.block_hash;
         if !self.db.contains_block(&parent_hash)? {
             // No parent block, trigger sync
-            tracing::warn!("sync::SyncProposal : syncing from {parent_hash}",);
+            tracing::info!("sync::SyncProposal : syncing from {parent_hash}",);
             let meta = self.recent_proposals.back().unwrap().header;
             self.request_missing_metadata(Some(meta))?;
 
@@ -371,6 +370,10 @@ impl Sync {
                 .done_with_peer(self.in_flight.take(), DownGrade::None);
         }
 
+        let SyncState::Phase2(check_sum) = self.state else {
+            anyhow::bail!("sync::MultiBlockResponse : invalid state");
+        };
+
         tracing::info!(
             "sync::MultiBlockResponse : received {} blocks for segment #{} from {}",
             response.len(),
@@ -379,10 +382,6 @@ impl Sync {
         );
 
         // If the checksum does not match, retry phase 1. Maybe the node has pruned the segment.
-        let SyncState::Phase2(check_sum) = self.state else {
-            anyhow::bail!("sync::MultiBlockResponse : invalid state");
-        };
-
         let checksum = response
             .iter()
             .fold(Hash::builder().with(Hash::ZERO.as_bytes()), |sum, p| {
@@ -460,7 +459,7 @@ impl Sync {
         }
         // Early exit if there's a request in-flight; and if it has not expired.
         if self.in_flight.is_some() || self.in_pipeline > self.max_blocks_in_flight {
-            tracing::warn!(
+            tracing::debug!(
                 "sync::RequestMissingBlocks : syncing {}/{} blocks",
                 self.in_pipeline,
                 self.max_blocks_in_flight
@@ -667,11 +666,11 @@ impl Sync {
                 .done_with_peer(self.in_flight.take(), DownGrade::None);
         }
 
-        // Check the linkage of the returned chain
         let SyncState::Phase1(meta) = &self.state else {
             anyhow::bail!("sync::MetadataResponse : invalid state");
         };
 
+        // Check the linkage of the returned chain
         let mut block_hash = meta.qc.block_hash;
         let mut block_num = meta.number;
         for meta in response.iter() {
@@ -712,14 +711,13 @@ impl Sync {
 
         // Record the oldest block in the chain's parent
         self.state = SyncState::Phase1(segment.last().cloned().unwrap());
-        let last_block_hash = segment.last().as_ref().unwrap().hash;
 
         // If the checkpoint is in this segment
         let checkpointed = segment.iter().any(|b| b.hash == self.checkpoint_hash);
         let started = self.started_at_block_number <= segment.first().as_ref().unwrap().number
             && self.started_at_block_number >= segment.last().as_ref().unwrap().number;
 
         // If the segment hits our history, start Phase 2.
-        if started || checkpointed || self.db.contains_block(&last_block_hash)? {
+        if started || checkpointed {
             self.state = SyncState::Phase2(Hash::ZERO);
         } else if Self::DO_SPECULATIVE {
             self.request_missing_metadata(None)?;
@@ -792,7 +790,7 @@ impl Sync {
         // Early exit if there's a request in-flight; and if it has not expired.
         if self.in_flight.is_some() || self.in_pipeline > self.max_batch_size {
             // anything more than this and we cannot be sure whether the segment hits history
-            tracing::warn!(
+            tracing::debug!(
                 "sync::RequestMissingMetadata : syncing {}/{} blocks",
                 self.in_pipeline,
                 self.max_batch_size
@@ -905,7 +903,6 @@ impl Sync {
                 tracing::warn!(number = %p.number(), hash = %p.hash(), "sync::InjectProposals : storing");
                 // TODO: just store old ZIL blocks - https://github.com/Zilliqa/zq2/issues/2232
             }
-
             self.message_sender.send_external_message(
                 self.peer_id,
                 ExternalMessage::InjectedProposal(InjectedProposal {