Skip to content

Commit fabf27f

Browse files
committed
feat(gossipsub): upgrade internal message queue
This started with an attempt to solve libp2p#5751 using the previous internal async-channel. After multiple ideas were discussed out of band, replacing the async-channel with a more tailored internal priority queue seemed inevitable. This priority queue allows us to implement the cancellation of in-flight IDONTWANTs very cleanly with the remove_data_messages function. Clearing the stale messages likewise becomes simpler, as we also make use of remove_data_messages. This has the added advantage of needing only a single priority queue, which makes the code simpler. If a peer is not making progress, we can assume it is not delivering high-priority messages and we can penalize it.
1 parent 2d59160 commit fabf27f

File tree

11 files changed

+1142
-804
lines changed

11 files changed

+1142
-804
lines changed

Cargo.lock

Lines changed: 299 additions & 127 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

protocols/gossipsub/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
- Fix incorrect default values in ConfigBuilder
1515
See [PR 6113](https://github.com/libp2p/rust-libp2p/pull/6113)
1616

17+
- Replace the internal `async-channel` used to dispatch messages from `NetworkBehaviour` to the `ConnectionHandler`
18+
with an internal priority queue. See [PR XXXX](https://github.com/libp2p/rust-libp2p/pull/XXXX)
19+
1720
## 0.49.2
1821

1922
- Relax `Behaviour::with_metrics` requirements, do not require DataTransform and TopicSubscriptionFilter to also impl Default
@@ -34,6 +37,7 @@
3437

3538
- Feature gate metrics related code. This changes some `Behaviour` constructor methods.
3639
See [PR 6020](https://github.com/libp2p/rust-libp2p/pull/6020)
40+
3741
- Send IDONTWANT before Publishing a new message.
3842
See [PR 6017](https://github.com/libp2p/rust-libp2p/pull/6017)
3943

protocols/gossipsub/Cargo.toml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ web-time = { workspace = true }
3232
# crates.io libp2p
3333
libp2p-core = "0.43"
3434
libp2p-identity = { version = "0.2", features = ["rand"] }
35-
libp2p-swarm = "0.46"
35+
libp2p-swarm = "0.47"
3636
quick-protobuf = "0.8"
3737
quick-protobuf-codec = "0.3.1"
3838
rand = "0.8"
@@ -41,12 +41,10 @@ serde = { version = "1", optional = true, features = ["derive"] }
4141
sha2 = "0.10.8"
4242
tracing = { workspace = true }
4343

44-
# Metrics dependencies
45-
# TODO: update to 0.23 when libp2p-metrics 0.17 is released.
46-
prometheus-client = "0.22.2"
44+
prometheus-client = { version = "0.23", optional = true }
4745

4846
[dev-dependencies]
49-
libp2p-swarm-test = "0.5.0"
47+
libp2p-swarm-test = { version = "0.6.0", features = ["tokio"] }
5048
quickcheck = { workspace = true }
5149
tracing-subscriber = { workspace = true, features = ["env-filter"] }
5250
tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time", "macros"] }

protocols/gossipsub/src/behaviour.rs

Lines changed: 53 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ use crate::{
6161
mcache::MessageCache,
6262
peer_score::{PeerScore, PeerScoreParams, PeerScoreState, PeerScoreThresholds, RejectReason},
6363
protocol::SIGNING_PREFIX,
64-
rpc::Sender,
64+
queue::Queue,
6565
rpc_proto::proto,
6666
subscription_filter::{AllowAllSubscriptionFilter, TopicSubscriptionFilter},
6767
time_cache::DuplicateCache,
@@ -751,6 +751,7 @@ where
751751
if self.send_message(
752752
*peer_id,
753753
RpcOut::Publish {
754+
message_id: msg_id.clone(),
754755
message: raw_message.clone(),
755756
timeout: Delay::new(self.config.publish_queue_duration()),
756757
},
@@ -1341,6 +1342,7 @@ where
13411342
self.send_message(
13421343
*peer_id,
13431344
RpcOut::Forward {
1345+
message_id: id.clone(),
13441346
message: msg,
13451347
timeout: Delay::new(self.config.forward_queue_duration()),
13461348
},
@@ -2081,9 +2083,9 @@ where
20812083
// steady-state size of the queues.
20822084
#[cfg(feature = "metrics")]
20832085
if let Some(m) = &mut self.metrics {
2084-
for sender_queue in self.connected_peers.values().map(|v| &v.sender) {
2085-
m.observe_priority_queue_size(sender_queue.priority_queue_len());
2086-
m.observe_non_priority_queue_size(sender_queue.non_priority_queue_len());
2086+
for sender_queue in self.connected_peers.values().map(|v| &v.messages) {
2087+
m.observe_priority_queue_size(sender_queue.priority_len());
2088+
m.observe_non_priority_queue_size(sender_queue.non_priority_len());
20872089
}
20882090
}
20892091

@@ -2499,6 +2501,11 @@ where
24992501
// Report expired messages
25002502
for (peer_id, failed_messages) in self.failed_messages.drain() {
25012503
tracing::debug!("Peer couldn't consume messages: {:?}", failed_messages);
2504+
#[cfg(feature = "metrics")]
2505+
if let Some(metrics) = self.metrics.as_mut() {
2506+
metrics.observe_failed_priority_messages(failed_messages.priority);
2507+
metrics.observe_failed_non_priority_messages(failed_messages.non_priority);
2508+
}
25022509
self.events
25032510
.push_back(ToSwarm::GenerateEvent(Event::SlowPeer {
25042511
peer_id,
@@ -2746,6 +2753,7 @@ where
27462753
self.send_message(
27472754
*peer_id,
27482755
RpcOut::Forward {
2756+
message_id: msg_id.clone(),
27492757
message: message.clone(),
27502758
timeout: Delay::new(self.config.forward_queue_duration()),
27512759
},
@@ -2874,33 +2882,20 @@ where
28742882
return false;
28752883
}
28762884

2877-
// Try sending the message to the connection handler.
2878-
match peer.sender.send_message(rpc) {
2885+
// Try sending the message to the connection handler,
2886+
// High priority messages should not fail.
2887+
match peer.messages.try_push(rpc) {
28792888
Ok(()) => true,
28802889
Err(rpc) => {
28812890
// Sending failed because the channel is full.
28822891
tracing::warn!(peer=%peer_id, "Send Queue full. Could not send {:?}.", rpc);
28832892

28842893
// Update failed message counter.
28852894
let failed_messages = self.failed_messages.entry(peer_id).or_default();
2886-
match rpc {
2887-
RpcOut::Publish { .. } => {
2888-
failed_messages.priority += 1;
2889-
failed_messages.publish += 1;
2890-
}
2891-
RpcOut::Forward { .. } => {
2892-
failed_messages.non_priority += 1;
2893-
failed_messages.forward += 1;
2894-
}
2895-
RpcOut::IWant(_) | RpcOut::IHave(_) | RpcOut::IDontWant(_) => {
2896-
failed_messages.non_priority += 1;
2897-
}
2898-
RpcOut::Graft(_)
2899-
| RpcOut::Prune(_)
2900-
| RpcOut::Subscribe(_)
2901-
| RpcOut::Unsubscribe(_) => {
2902-
unreachable!("Channel for highpriority control messages is unbounded and should always be open.")
2903-
}
2895+
if rpc.priority() {
2896+
failed_messages.priority += 1;
2897+
} else {
2898+
failed_messages.non_priority += 1;
29042899
}
29052900

29062901
// Update peer score.
@@ -3125,23 +3120,22 @@ where
31253120
// The protocol negotiation occurs once a message is sent/received. Once this happens we
31263121
// update the type of peer that this is in order to determine which kind of routing should
31273122
// occur.
3128-
let connected_peer = self
3129-
.connected_peers
3130-
.entry(peer_id)
3131-
.or_insert_with(|| PeerDetails {
3132-
kind: PeerKind::Floodsub,
3133-
connections: vec![],
3134-
outbound: false,
3135-
sender: Sender::new(self.config.connection_handler_queue_len()),
3136-
topics: Default::default(),
3137-
dont_send: LinkedHashMap::new(),
3138-
});
3123+
let connected_peer = self.connected_peers.entry(peer_id).or_insert(PeerDetails {
3124+
kind: PeerKind::Floodsub,
3125+
connections: vec![],
3126+
outbound: false,
3127+
messages: Queue::new(self.config.connection_handler_queue_len()),
3128+
topics: Default::default(),
3129+
dont_send: LinkedHashMap::new(),
3130+
});
31393131
// Add the new connection
31403132
connected_peer.connections.push(connection_id);
31413133

3134+
// This clones a reference to the Queue so any new handlers reference the same underlying
3135+
// queue. No data is actually cloned here.
31423136
Ok(Handler::new(
31433137
self.config.protocol_config(),
3144-
connected_peer.sender.new_receiver(),
3138+
connected_peer.messages.clone(),
31453139
))
31463140
}
31473141

@@ -3153,25 +3147,24 @@ where
31533147
_: Endpoint,
31543148
_: PortUse,
31553149
) -> Result<THandler<Self>, ConnectionDenied> {
3156-
let connected_peer = self
3157-
.connected_peers
3158-
.entry(peer_id)
3159-
.or_insert_with(|| PeerDetails {
3160-
kind: PeerKind::Floodsub,
3161-
connections: vec![],
3162-
// Diverging from the go implementation we only want to consider a peer as outbound
3163-
// peer if its first connection is outbound.
3164-
outbound: !self.px_peers.contains(&peer_id),
3165-
sender: Sender::new(self.config.connection_handler_queue_len()),
3166-
topics: Default::default(),
3167-
dont_send: LinkedHashMap::new(),
3168-
});
3150+
let connected_peer = self.connected_peers.entry(peer_id).or_insert(PeerDetails {
3151+
kind: PeerKind::Floodsub,
3152+
connections: vec![],
3153+
// Diverging from the go implementation we only want to consider a peer as outbound peer
3154+
// if its first connection is outbound.
3155+
outbound: !self.px_peers.contains(&peer_id),
3156+
messages: Queue::new(self.config.connection_handler_queue_len()),
3157+
topics: Default::default(),
3158+
dont_send: LinkedHashMap::new(),
3159+
});
31693160
// Add the new connection
31703161
connected_peer.connections.push(connection_id);
31713162

3163+
// This clones a reference to the Queue so any new handlers reference the same underlying
3164+
// queue. No data is actually cloned here.
31723165
Ok(Handler::new(
31733166
self.config.protocol_config(),
3174-
connected_peer.sender.new_receiver(),
3167+
connected_peer.messages.clone(),
31753168
))
31763169
}
31773170

@@ -3213,6 +3206,8 @@ where
32133206
}
32143207
}
32153208
}
3209+
// rpc is only used for metrics code.
3210+
#[allow(unused_variables)]
32163211
HandlerEvent::MessageDropped(rpc) => {
32173212
// Account for this in the scoring logic
32183213
if let PeerScoreState::Active(peer_score) = &mut self.peer_score {
@@ -3221,32 +3216,7 @@ where
32213216

32223217
// Keep track of expired messages for the application layer.
32233218
let failed_messages = self.failed_messages.entry(propagation_source).or_default();
3224-
failed_messages.timeout += 1;
3225-
match rpc {
3226-
RpcOut::Publish { .. } => {
3227-
failed_messages.publish += 1;
3228-
}
3229-
RpcOut::Forward { .. } => {
3230-
failed_messages.forward += 1;
3231-
}
3232-
_ => {}
3233-
}
3234-
3235-
// Record metrics on the failure.
3236-
#[cfg(feature = "metrics")]
3237-
if let Some(metrics) = self.metrics.as_mut() {
3238-
match rpc {
3239-
RpcOut::Publish { message, .. } => {
3240-
metrics.publish_msg_dropped(&message.topic);
3241-
metrics.timeout_msg_dropped(&message.topic);
3242-
}
3243-
RpcOut::Forward { message, .. } => {
3244-
metrics.forward_msg_dropped(&message.topic);
3245-
metrics.timeout_msg_dropped(&message.topic);
3246-
}
3247-
_ => {}
3248-
}
3249-
}
3219+
failed_messages.non_priority += 1;
32503220
}
32513221
HandlerEvent::Message {
32523222
rpc,
@@ -3345,10 +3315,17 @@ where
33453315
"Could not handle IDONTWANT, peer doesn't exist in connected peer list");
33463316
continue;
33473317
};
3318+
3319+
// Remove messages from the queue.
3320+
#[allow(unused)]
3321+
let removed = peer.messages.remove_data_messages(&message_ids);
3322+
33483323
#[cfg(feature = "metrics")]
33493324
if let Some(metrics) = self.metrics.as_mut() {
33503325
metrics.register_idontwant(message_ids.len());
3326+
metrics.register_removed_messages(removed);
33513327
}
3328+
33523329
for message_id in message_ids {
33533330
peer.dont_send.insert(message_id, Instant::now());
33543331
// Don't exceed capacity.

0 commit comments

Comments
 (0)